def stop_vnode(self, request, context):
    """Tear down the container backing a batch vnode.

    Steps, in order: stop the container, unmount its OSS mounts, delete its
    filesystem through ``self.imgmgr``, delete the bridge named in the
    request and release the container's GPU devices.  A failed stop or
    delete is only logged; the remaining steps still run.

    Args:
        request: RPC message; ``taskid``, ``username``, ``vnodeid``,
            ``vnode.network.brname`` and ``vnode.mount`` are read.
        context: RPC context; not used here.

    Returns:
        An ``rpc_pb2.Reply`` whose status is always ``ACCEPTED``.
    """
    logger.info('stop vnode with config: ' + str(request))

    # Pull everything out of the request before touching the container.
    task = request.taskid
    owner = request.username
    node = request.vnodeid
    bridge = request.vnode.network.brname
    mounts = request.vnode.mount

    lxcname = '%s-batch-%s-%s' % (owner, task, str(node))
    logger.info("Stop the task with lxc:" + lxcname)

    stopped = lxc.Container(lxcname).stop()
    if not stopped:
        logger.error("stop container %s failed" % lxcname)
    else:
        logger.info("stop container %s success" % lxcname)

    # Unmount OSS before the container filesystem is removed.
    self.umount_oss("/var/lib/lxc/%s/oss" % (lxcname), mounts)

    logger.info("deleting container:%s" % lxcname)
    deleted = self.imgmgr.deleteFS(lxcname)
    if not deleted:
        logger.error("delete container %s failed" % lxcname)
    else:
        logger.info("delete container %s success" % lxcname)

    # Remove the OVS bridge that was set up for this vnode.
    if bridge is not None:
        netcontrol.del_bridge(bridge)

    # Give back any GPU devices attached to the container.
    self.release_gpu_device(lxcname)

    return rpc_pb2.Reply(status=rpc_pb2.Reply.ACCEPTED, message="")
def stop_task(self, request, context):
    """Kill the container that runs one batch task.

    The container name is derived from ``request.username``,
    ``request.taskid`` and ``request.vnodeid`` and ``lxc-stop -k`` is run on
    it.  The outcome of ``lxc-stop`` does not affect the reply.

    Args:
        request: RPC message providing ``taskid``, ``username`` and
            ``vnodeid``.
        context: RPC context; not used here.

    Returns:
        An ``rpc_pb2.Reply`` whose status is always ``ACCEPTED``.
    """
    logger.info('stop task with config: ' + str(request))
    taskid = request.taskid
    username = request.username
    vnodeid = request.vnodeid
    lxcname = '%s-batch-%s-%s' % (username, taskid, str(vnodeid))
    logger.info("Stop the task with lxc:" + lxcname)

    # The name comes from request fields, so pass it as a single argv entry
    # instead of interpolating it into a shell command line.
    try:
        subprocess.run(
            ["lxc-stop", "-k", "-n", lxcname],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        )
    except OSError as err:
        # Without a shell a missing/unrunnable binary raises; keep the call
        # best-effort as it was before.
        logger.error("fail to run lxc-stop for %s: %s" % (lxcname, err))

    return rpc_pb2.Reply(status=rpc_pb2.Reply.ACCEPTED, message="")
def stop_tasks(self, request, context):
    """Kill the containers of every task listed in the request.

    For each entry of ``request.taskmsgs`` the container name is built from
    its ``username``, ``taskid``, ``instanceid`` and ``token`` and
    ``lxc-stop -k`` is run on it.  The outcome of ``lxc-stop`` does not
    affect the reply.

    Args:
        request: RPC message providing the iterable ``taskmsgs``.
        context: RPC context; not used here.

    Returns:
        An ``rpc_pb2.Reply`` whose status is always ``ACCEPTED``.
    """
    for msg in request.taskmsgs:
        lxcname = '%s-batch-%s-%s-%s' % (msg.username, msg.taskid, str(msg.instanceid), msg.token)
        logger.info("Stop the task with lxc:" + lxcname)
        # The name comes from request fields, so pass it as a single argv
        # entry instead of interpolating it into a shell command line.
        try:
            subprocess.run(
                ["lxc-stop", "-k", "-n", lxcname],
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
            )
        except OSError as err:
            # Keep going with the remaining tasks; the call was best-effort
            # before as well.
            logger.error("fail to run lxc-stop for %s: %s" % (lxcname, err))

    return rpc_pb2.Reply(status=rpc_pb2.Reply.ACCEPTED, message="")
def start_task(self, request, context):
    """Launch a batch task in a background thread and reply immediately.

    The command line, package path, environment variables, timeout, token
    and output redirect paths are taken from the request, and
    ``self.execute_task`` is started with them on a daemon thread.  This
    method does not wait for the task.

    Note that ``taskid`` and ``vnodeid`` entries are written into
    ``request.parameters.command.envVars`` itself.

    Args:
        request: RPC message providing ``taskid``, ``username``,
            ``vnodeid``, ``parameters``, ``timeout`` and ``token``.
        context: RPC context; not used here.

    Returns:
        An ``rpc_pb2.Reply`` whose status is always ``ACCEPTED``.
    """
    logger.info('start task with config: ' + str(request))
    taskid = request.taskid
    username = request.username
    vnodeid = request.vnodeid

    # Task configuration carried by the request.
    command = request.parameters.command.commandLine
    pkgpath = request.parameters.command.packagePath
    envs = request.parameters.command.envVars
    envs['taskid'] = str(taskid)
    envs['vnodeid'] = str(vnodeid)
    timeout = request.timeout
    token = request.token
    outpath = [
        request.parameters.stdoutRedirectPath,
        request.parameters.stderrRedirectPath,
    ]
    lxcname = '%s-batch-%s-%s' % (username, taskid, str(vnodeid))

    # daemon=True replaces the deprecated Thread.setDaemon(True).
    thread = threading.Thread(
        target=self.execute_task,
        args=(username, taskid, vnodeid, envs, lxcname, pkgpath, command, timeout, outpath, token),
        daemon=True,
    )
    thread.start()

    return rpc_pb2.Reply(status=rpc_pb2.Reply.ACCEPTED, message="")
def process_task(self, request, context):
    """Create and start a container for a task instance, then run the task.

    Steps: create the container, mount the user's OSS directories and add
    matching bind-mount entries to the container config, start the
    container, attach GPU devices, and finally hand the task to
    ``self.execute_task`` on a daemon thread.

    Note that ``taskid`` and ``instanceid`` entries are written into
    ``request.parameters.command.envVars`` itself.

    Args:
        request: RPC message providing ``id``, ``instanceid``, ``username``,
            ``token``, ``timeout``, ``parameters`` and ``cluster``.
        context: RPC context; not used here.

    Returns:
        An ``rpc_pb2.Reply``.  Status is ``REFUSED`` when the container
        cannot be created (the message is the second value returned by
        ``self.create_container``), cannot be started, or cannot get its
        GPU devices; otherwise ``ACCEPTED``.
    """
    logger.info('excute task with parameter: ' + str(request))
    taskid = request.id
    instanceid = request.instanceid

    # Task configuration carried by the request.
    command = request.parameters.command.commandLine
    pkgpath = request.parameters.command.packagePath
    envs = request.parameters.command.envVars
    envs['taskid'] = str(taskid)
    envs['instanceid'] = str(instanceid)

    image = {}
    image['name'] = request.cluster.image.name
    if request.cluster.image.type == rpc_pb2.Image.PRIVATE:
        image['type'] = 'private'
    elif request.cluster.image.type == rpc_pb2.Image.PUBLIC:
        image['type'] = 'public'
    else:
        image['type'] = 'base'
    image['owner'] = request.cluster.image.owner

    username = request.username
    token = request.token
    lxcname = '%s-batch-%s-%s-%s' % (username, taskid, str(instanceid), token)
    instance_type = request.cluster.instance
    mount_list = request.cluster.mount
    outpath = [
        request.parameters.stdoutRedirectPath,
        request.parameters.stderrRedirectPath,
    ]
    timeout = request.timeout
    gpu_need = int(request.cluster.instance.gpu)

    # Create the container; on failure the second value is the error text.
    [success, ip] = self.create_container(instanceid, username, image, lxcname, instance_type)
    if not success:
        return rpc_pb2.Reply(status=rpc_pb2.Reply.REFUSED, message=ip)

    # Mount OSS and expose each mount inside the container via bind mounts.
    self.mount_oss("%s/global/users/%s/oss" % (self.fspath, username), mount_list)
    lxcpath = "/var/lib/lxc/%s" % lxcname
    # Previously `rootfs` was never defined here, so any non-empty
    # mount_list raised NameError.
    rootfs = lxcpath + "/rootfs"
    mount_str = "lxc.mount.entry = %s/global/users/%s/oss/%s %s/root/oss/%s none bind,rw,create=dir 0 0"
    with open(lxcpath + "/config", 'a+') as conffile:
        for mount in mount_list:
            conffile.write("\n" + mount_str % (self.fspath, username, mount.remotePath, rootfs, mount.remotePath))

    logger.info("Start container %s..." % lxcname)
    # The name is built from request fields: pass it as one argv entry, not
    # through a shell.  A binary that cannot be run counts as a failed start.
    try:
        ret = subprocess.run(
            ['lxc-start', '-n', lxcname],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        )
        started = ret.returncode == 0
    except OSError as err:
        logger.error('fail to run lxc-start for %s: %s' % (lxcname, err))
        started = False
    if not started:
        logger.error('start container %s failed' % lxcname)
        self.release_ip(ip)
        self.imgmgr.deleteFS(lxcname)
        return rpc_pb2.Reply(status=rpc_pb2.Reply.REFUSED, message="Can't start the container")
    logger.info('start container %s success' % lxcname)

    # Attach GPU devices; undo everything if that fails.
    [success, msg] = self.add_gpu_device(lxcname, gpu_need)
    if not success:
        logger.error("Fail to add gpu device. " + msg)
        # Previously this called .stop() on an undefined `container`
        # (NameError).  Kill the container with lxc-stop instead.
        try:
            subprocess.run(
                ['lxc-stop', '-k', '-n', lxcname],
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
            )
        except OSError as err:
            logger.error('fail to run lxc-stop for %s: %s' % (lxcname, err))
        self.release_ip(ip)
        self.imgmgr.deleteFS(lxcname)
        return rpc_pb2.Reply(status=rpc_pb2.Reply.REFUSED, message="Fail to add gpu device. " + msg)

    # daemon=True replaces the deprecated Thread.setDaemon(True).
    thread = threading.Thread(
        target=self.execute_task,
        args=(username, taskid, instanceid, envs, lxcname, pkgpath, command, timeout, outpath, ip, token, mount_list),
        daemon=True,
    )
    thread.start()

    return rpc_pb2.Reply(status=rpc_pb2.Reply.ACCEPTED, message="")
def start_vnode(self, request, context):
    """Create and start the container for a batch vnode.

    Steps: create the container, mount OSS under the container directory
    and add matching bind-mount entries to its config, start the container,
    set up a GRE tunnel to the master when the master is another host,
    attach GPU devices and start the ssh service inside the container.
    If starting the container, adding GPUs or starting ssh fails, OSS is
    unmounted and the container filesystem is deleted before replying.

    Args:
        request: RPC message providing ``taskid``, ``vnodeid``, ``username``
            and ``vnode`` (image, instance, mount, network, hostname).
        context: RPC context; not used here.

    Returns:
        An ``rpc_pb2.Reply``: ``REFUSED`` with an explanatory message when
        any step fails, otherwise ``ACCEPTED``.
    """
    logger.info('start vnode with config: ' + str(request))
    taskid = request.taskid
    vnodeid = request.vnodeid

    image = {}
    image['name'] = request.vnode.image.name
    if request.vnode.image.type == rpc_pb2.Image.PRIVATE:
        image['type'] = 'private'
    elif request.vnode.image.type == rpc_pb2.Image.PUBLIC:
        image['type'] = 'public'
    else:
        image['type'] = 'base'
    image['owner'] = request.vnode.image.owner

    username = request.username
    lxcname = '%s-batch-%s-%s' % (username, taskid, str(vnodeid))
    instance_type = request.vnode.instance
    mount_list = request.vnode.mount
    gpu_need = int(request.vnode.instance.gpu)
    ipaddr = request.vnode.network.ipaddr
    gateway = request.vnode.network.gateway
    brname = request.vnode.network.brname
    masterip = request.vnode.network.masterip
    hostname = request.vnode.hostname

    # Create the container; on failure the second value is the error text.
    [success, msg] = self.create_container(taskid, vnodeid, username, image, lxcname, instance_type, ipaddr, gateway, brname, hostname)
    if not success:
        return rpc_pb2.Reply(status=rpc_pb2.Reply.REFUSED, message=msg)

    # Mount OSS and expose each mount inside the container via bind mounts.
    lxcpath = "/var/lib/lxc/%s" % lxcname
    rootfs = lxcpath + "/rootfs"
    osspath = lxcpath + "/oss"
    self.mount_oss(osspath, mount_list)
    # lxcpath is passed as a format argument rather than concatenated into
    # the template, so a '%' in the container name cannot break formatting.
    mount_str = "lxc.mount.entry = %s/oss/%s/%s %s/root/oss/%s none bind,rw,create=dir 0 0"
    with open(lxcpath + "/config", 'a+') as conffile:
        for mount in mount_list:
            conffile.write("\n" + mount_str % (lxcpath, mount.provider, mount.remotePath, rootfs, mount.remotePath))

    logger.info("Start container %s..." % lxcname)
    container = lxc.Container(lxcname)

    def _refuse(message, running):
        # Undo the work done so far and build the REFUSED reply.
        if running:
            container.stop()
        self.umount_oss(osspath, mount_list)
        self.imgmgr.deleteFS(lxcname)
        return rpc_pb2.Reply(status=rpc_pb2.Reply.REFUSED, message=message)

    def _succeeds(argv):
        # Run argv without a shell (lxcname is built from request fields);
        # a command that cannot be run at all counts as a failure.
        try:
            ret = subprocess.run(argv, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        except OSError as err:
            logger.error('fail to run %s: %s' % (argv[0], err))
            return False
        return ret.returncode == 0

    if not _succeeds(['lxc-start', '-n', lxcname]):
        logger.error('start container %s failed' % lxcname)
        return _refuse("Can't start the container(%s)" % lxcname, running=False)
    logger.info('start container %s success' % lxcname)

    # The master lives on another host: connect the bridge to it over GRE.
    if masterip != self.worker_ip:
        netcontrol.setup_gre(brname, masterip)

    # Attach GPU devices.
    [success, msg] = self.add_gpu_device(lxcname, gpu_need)
    if not success:
        logger.error("Fail to add gpu device. " + msg)
        return _refuse("Fail to add gpu device. " + msg, running=True)

    # Start the ssh service inside the container.
    if not _succeeds(['lxc-attach', '-n', lxcname, '--', 'service', 'ssh', 'start']):
        logger.error('Fail to start ssh service of container %s' % lxcname)
        return _refuse("Fail to start ssh service. lxc(%s)" % lxcname, running=True)

    return rpc_pb2.Reply(status=rpc_pb2.Reply.ACCEPTED, message="")