def StopVM(self, args):
    ''' Stop the Alibaba VM and wait until it reports "Stopped".

    The instance must pass CheckRunStatus(args, "Running") first.  Right
    after a successful StopInstance request the status is expected to be
    the transient "Stopping"; any other status is reported as an error.

    Returns 0 on success, non-zero on failure.
    '''
    if self.CheckID(args) == False:
        return 1

    rc = self.CheckRunStatus(args, "Running")
    if rc != 0:
        error("Not running")
        return rc

    self.Inform("StopVM")
    stop_cmd = "aliyuncli ecs StopInstance --InstanceId %s" % args.vm_id
    rc, _output, _errval = self.DoCmd(stop_cmd)
    if rc != 0:
        return rc               # the request itself failed, nothing to wait for

    # The instance becomes "Stopping" after a successful API request,
    # and the instance becomes "Stopped" after it is stopped successfully.
    status = self.GetRunStatus(args)
    if status == "Stopping":
        return self.WaitForRunStatus(args, "Stopped", TIMEOUT_2)

    error("Asked VM to stop, but status = \"%s\"" % (status))
    return 1
def GetImageId(self, args):
    ''' Resolve args.image_name to an Alibaba image id, stored in args.image_id.

    Does nothing when args.image_id is already set.  Otherwise runs
    "aliyuncli ecs DescribeImages" filtered by region, image name and
    owner alias, and takes the first image of the reply.

    Returns 0 on success, 1 if the command fails or no image matches.
    '''
    # if already have the ID, can skip this step.
    # Note "None" may arrive as a string from the args file
    if args.image_id not in ("", None, "None"):
        return 0

    # query by name, to get the id
    cmd = "aliyuncli ecs DescribeImages"
    cmd += " --RegionId %s" % args.region
    cmd += " --ImageName \"%s\"" % args.image_name
    cmd += " --ImageOwnerAlias %s" % args.image_owner_alias
    retcode, output, errval = self.DoCmd(cmd)
    if retcode != 0:
        error(errval)
        return 1

    # decode the JSON output
    decoded_output = json.loads(output)
    trace(2, json.dumps(decoded_output, indent=4, sort_keys=True))

    # an unknown image name yields an empty list, not a command failure;
    # report it instead of dying with an IndexError
    images = decoded_output['Images']['Image']
    if not images:
        error("No image named \"%s\" found in region %s" %
              (args.image_name, args.region))
        return 1

    args.image_id = images[0]['ImageId']
    return 0
def RestartVM(self, args):  # also known as 'RebootInstance' on Alibaba
    ''' Restart (reboot) the Alibaba VM and wait until it is running again.

    Returns 0 on success, non-zero on failure.
    '''
    if self.CheckID(args) == False:
        return 1

    rc = self.CheckRunStatus(args, "Running")
    if rc != 0:
        error("Not running")
        return rc

    self.Inform("RestartVM")
    reboot_cmd = "aliyuncli ecs RebootInstance --InstanceId %s" % args.vm_id
    rc, _output, _errval = self.DoCmd(reboot_cmd)
    if rc != 0:
        return rc

    # With Alibaba the status never becomes "un-running" during a restart,
    # so we watch for the VM to FAIL to ping to know the restart actually
    # occurred.  Then we simply wait until it is back up again - pingable
    # and ssh-able - to know it is running.
    if args.pingable == 1:
        rc = self.WaitForPing(args, False, TIMEOUT_2)
        if rc != 0:
            error("never went un-pingable. Did VM restart?")
            return rc
    else:
        # ping isn't supported: let the VM go down enough so SSH stops
        # (we hope) and treat that as success
        time.sleep(5)

    return self.WaitTillRunning(args, "Running", TIMEOUT_1)
def StopVM(self, args):
    ''' Stop the AWS VM and wait until it reports "stopped".

    The instance must pass CheckRunStatus(args, "running") first.  Right
    after a successful stop-instances request the status is expected to be
    the transient "stopping"; any other status is reported as an error.

    Returns 0 on success, non-zero on failure.
    '''
    if self.CheckID(args) == False:
        return 1

    rc = self.CheckRunStatus(args, "running")
    if rc != 0:
        error("Not running")
        return rc

    self.Inform("StopVM")
    stop_cmd = "aws ec2 stop-instances --instance-id %s --region %s" % (
        args.vm_id, args.region)            # region e.g. us-west-2
    rc, _output, _errval = self.DoCmd(stop_cmd)
    if rc != 0:
        return rc               # the request itself failed, nothing to wait for

    # The instance becomes "stopping" after a successful API request,
    # and the instance becomes "stopped" after it is stopped successfully.
    status = self.GetRunStatus(args)
    if status == "stopping":
        return self.WaitForRunStatus(args, "stopped", TIMEOUT_2)

    error("Asked VM to stop, but status = \"%s\"" % (status))
    return 1
def GetRunStatus(self, args):
    ''' Return the 'status' field of "gcloud ... instances describe".

    The value is something like "RUNNING" or "STOPPED".  On error a number
    is returned instead of a string: 1 when CheckID() rejects the args, or
    the non-zero return code of the describe command.
    '''
    if self.CheckID(args) == False:
        return 1

    # --zone is e.g. "us-west1-b"; --quiet prevents interactive prompting;
    # note gcloud takes the VM name, not a uuid as with aws/azure
    describe_cmd = ("gcloud --format=\"json\" beta compute"
                    " instances describe"
                    " --zone \"%s\""
                    " --quiet"
                    " \"%s\" ") % (args.region, args.vm_name)

    rc, output, _errval = self.DoCmd(describe_cmd)
    if rc != 0:
        error("Problems describe VM \"%s\"" % args.vm_name)
        return rc

    instance = json.loads(output)       # convert json format to python structure
    trace(3, json.dumps(instance, indent=4, sort_keys=True))

    run_state = instance['status']
    self.Inform(run_state)
    return run_state
def ArgSanity(self, parser, args):
    ''' Alibaba-specific argument sanity checking.

    Only args.bandwidth_out is checked; 'parser' is not used.
    Returns 0 when the arguments are good, 1 when the caller should stop.
    '''
    bandwidth = args.bandwidth_out
    if bandwidth < 1 or bandwidth > 200:
        error("bandwidth must be between 1 an 200")
        return 1
    return 0
def GetIPSetupCorrectly(self, args):
    ''' Called after 'running' status; for Alibaba only verifies an IP is set.

    Returns 0 when args.vm_ip is non-empty, 1 (after reporting) otherwise.
    '''
    # the ip value should already have been set in Create
    if args.vm_ip != "":
        # TODO: see if a freshly queried IP is different from the vm_ip
        #       gathered before.  Alibaba is NOT supposed to change the IP
        #       address for the life of the VM, but that is an assumption:
        #       it was seen to move more than once.
        return 0

    error("No IP for VM: \"%s\"" % args.vm_name)
    return 1
def ShowSecurityGroups(self, args):
    ''' Display the names of all current Alibaba security groups.

    Prints the raw (filtered, JSON) output of
    "aliyuncli ecs DescribeSecurityGroups" for args.region.

    Returns 0 on success, 1 if the command fails.
    '''
    cmd = 'aliyuncli ecs DescribeSecurityGroups'
    cmd += " --RegionId %s" % args.region       # us-west-1
    cmd += " --PageSize 50"                     # default is 10, max is 50
    cmd += " --output json"
    cmd += " --filter SecurityGroups.SecurityGroup[].SecurityGroupName"

    retcode, output, errval = self.DoCmd(cmd)   # call the Alibaba command
    if retcode != 0:
        error("Problems describing security groups")
        return 1

    # parenthesized single-argument form prints the same text under
    # Python 2 and is also valid Python 3
    print(output)
    return 0
def DeleteSecurityGroup(self, args):
    ''' Delete the AWS security group identified by args.nsg_id.

    On success args.nsg_id is reset to None so the stale id is not reused.

    Returns 0 on success, 1 if there is no group id to delete, or the
    non-zero return code of the delete command.
    '''
    trace(2, "\"%s\" %s" % (args.nsg_name, args.nsg_id))

    # "None" may arrive as a string when values are restored from the
    # saved args file
    if args.nsg_id is None or args.nsg_id == "None":
        error("NSG %s already deleted" % args.nsg_name)
        return 1

    cmd = "aws ec2 delete-security-group"
    cmd += " --group-id %s" % args.nsg_id
    # group ids are region specific: target the same region the
    # describe-security-groups lookup uses
    cmd += " --region %s" % args.region

    retcode, output, errval = self.DoCmd(cmd)   # call the AWS command
    if retcode != 0:
        return retcode

    args.nsg_id = None                          # remove id from args
    return 0
def ExistingSecurityGroup(self, args):
    ''' Does the security group named args.nsg_name exist?  Get its id if so.

    Lists every security group in args.region and searches for a matching
    GroupName; on a match the GroupId is stored in args.nsg_id.

    Returns 0 when found; 1 when the name is unset, the describe command
    fails, or no group has that name.
    '''
    trace(2, "\"%s\"" % (args.nsg_name))

    if args.nsg_name in ("", None, "None"):
        error("NetworkSecurityGroup name is \"%s\"" % args.nsg_name)
        return 1

    # Is there a better way to do this than to pull in the entire
    # dictionary and iterate through it?
    describe_cmd = "aws ec2 describe-security-groups "
    describe_cmd += " --region %s" % args.region    # us-west-2
    rc, output, _errval = self.DoCmd(describe_cmd)
    if rc != 0:
        error("Problems describing security groups")
        return 1

    # slow linear search for the name
    groups = json.loads(output)["SecurityGroups"]
    for idx, group in enumerate(groups):
        if group["GroupName"] != args.nsg_name:
            continue
        args.nsg_id = group["GroupId"]
        debug(2, "%2d %-12s \"%s\"" %
              (idx, group["GroupId"], group["GroupName"]))
        return 0                                    # found it

    trace(2, "Did not find security group: \"%s\"" % args.nsg_name)
    return 1
def GetImageId(self, args):
    ''' Resolve args.image_name to a region-specific AWS image id (AMI).

    Runs "aws ec2 describe-images" filtered by name in args.region and
    stores the id of the first match in args.image_id.

    Returns 0 on success, 1 when no image has that name.  Exits the script
    (sys.exit(1)) when the describe command itself fails.
    '''
    cmd = "aws ec2 describe-images"
    cmd += " --region %s" % args.region
    cmd += " --filters Name=name,Values=\"%s\"" % args.image_name
    retcode, output, errval = self.DoCmd(cmd)
    if retcode != 0:
        error(errval)
        sys.exit(1)             # fail to get name, exit script

    # decode the JSON output
    decoded_output = json.loads(output)

    # an unknown image name yields an empty list, not a command failure;
    # report it instead of dying with an IndexError
    images = decoded_output['Images']
    if not images:
        error("No image named \"%s\" found in region %s" %
              (args.image_name, args.region))
        return 1

    args.image_id = images[0]['ImageId']        # e.g. ami-8ee326f6
    return 0
def StartVM(self, args):
    ''' Start the AWS VM and wait until it is fully running.

    Nothing is done when the instance is already "running".  Starting is
    only attempted from the "stopped" (or "null") state.

    Returns 0 on success, non-zero on failure.
    '''
    if self.CheckID(args) == False:
        return 1

    # get run status and check current state
    status = self.GetRunStatus(args)
    if status == "running":
        return 0                # already running, simply return
    if status == "stopping":
        error("%s is in %s state, can't start running now" %
              (args.vm_id, status))
        return 1
    if status not in ("stopped", "null"):
        error("id %s is in \"%s\" state, not sure can start running" %
              (args.vm_id, status))
        return 1                # unexpected status

    # start the VM
    self.Inform("StartVM")
    cmd = "aws ec2 start-instances"
    cmd += " --instance-id %s" % args.vm_id
    cmd += " --region %s" % args.region         # us-west-2
    retcode, output, errval = self.DoCmd(cmd)
    if retcode != 0:
        # a failed start request must not be reported as success
        return retcode

    return self.WaitTillRunning(args, "running", TIMEOUT_1)
def ShowSecurityGroups(self, args):
    ''' Display all current AWS security groups in args.region.

    Prints one line per group: index, GroupId, GroupName and Description.

    Returns 0 on success, 1 if the describe command fails.
    '''
    cmd = "aws ec2 describe-security-groups "
    cmd += " --region %s" % args.region         # us-west-2
    retcode, output, errval = self.DoCmd(cmd)   # call the AWS command
    if retcode != 0:
        error("Problems describing security groups")
        return 1

    # the reply holds a list of security groups; display them.
    # The parenthesized single-argument print produces the same text under
    # Python 2 and is also valid Python 3.
    groups = json.loads(output)["SecurityGroups"]
    for idx, group in enumerate(groups):
        print("%2d %-12s \"%s\" \"%s\"" % (
            idx, group["GroupId"], group["GroupName"], group["Description"]))
    return 0
def RestartVM(self, args):  # also known as 'reboot' on aws
    ''' Restart (reboot) the AWS VM and wait until it is running again.

    Returns 0 on success, non-zero on failure.
    '''
    if self.CheckID(args) == False:
        return 1

    rc = self.CheckRunStatus(args, "running")
    if rc != 0:
        error("Not running")
        return rc

    self.Inform("RestartVM")
    reboot_cmd = "aws ec2 reboot-instances --instance-id %s --region %s" % (
        args.vm_id, args.region)            # region e.g. us-west-2
    rc, _output, _errval = self.DoCmd(reboot_cmd)
    if rc != 0:
        return rc

    # On aws the status never becomes "un-running" at any time during the
    # reset process, so we watch for the VM to FAIL to ping to know the
    # restart actually occurred.  Then we simply wait until it is back up
    # again - pingable and ssh-able - to know it is running.
    if args.pingable == 1:
        rc = self.WaitForPing(args, False, TIMEOUT_2)
        if rc != 0:
            error("never went un-pingable. Did VM restart?")
            return rc
    else:
        # ping isn't supported: let the VM go down enough so SSH stops
        # (we hope) and treat that as success
        time.sleep(5)

    return self.WaitTillRunning(args, "running", TIMEOUT_1)
def DeleteVM(self, args):
    ''' Delete the GCP VM and clear the persistent argument state.

    Returns 0 on success, 1 when there is no valid VM id, or the non-zero
    return code of the delete command.
    '''
    # A missing VM id means that either a VM hasn't been created,
    # or it was already deleted.
    if self.CheckID(args) == False:
        return 1

    self.Inform("DeleteVM")

    # --zone is e.g. "us-west1-b"; --quiet prevents the
    # "do you want to delete y/n?" prompt; note gcloud takes the VM name,
    # not a uuid as with aws/azure
    delete_cmd = ("gcloud --format=\"json\" beta compute"
                  " instances delete"
                  " --zone \"%s\""
                  " --quiet"
                  " \"%s\" ") % (args.region, args.vm_name)

    rc, _output, _errval = self.DoCmd(delete_cmd)
    if rc != 0:
        # Is error handled ok?  What if there are problems deleting --
        # is the instance left around?
        error("Problems deleting VM \"%s\"" % args.vm_name)
        return rc

    # Anything CSP specific that was allocated in Create probably needs to
    # be deallocated here.
    #
    # Clean out everything in the internal args file, so that the user must
    # fully specify any options on the next create.  This is the
    # easiest/safest way to make sure any CSP specific id parameters, like
    # the VM id, also get cleared.  Really big hammer, but it squishes
    # everything fairly.
    self.Clean(args)            # remove file with the persistent id, ip address, ..
    self.m_args_fname = ""      # clear name, so args won't be written back when done
    return rc
def GetIPSetupCorrectly(self, args):
    ''' Called after 'running' status to (re)read the VM address from AWS.

    On aws the IP address changes across stop/start, so the instance is
    described again and args.vm_ip is set from its 'PublicDnsName'.  The
    instance's 'KeyName' is cross-checked against args.key_name.

    Returns 0 on success, 1 if the describe command fails or the key
    names differ.
    '''
    # get the full description of the instance (a large json record);
    # from this we can get the public address of the instance
    cmd = "aws ec2 describe-instances"
    cmd += " --instance-id %s" % args.vm_id
    cmd += " --region %s" % args.region         # us-west-2
    retcode, output, errval = self.DoCmd(cmd)
    if retcode != 0:
        # without this check json.loads() would choke on the failed output
        error("Problems describing VM \"%s\"" % args.vm_id)
        return 1

    # the json structure from 'describe-instances' has about 50 values in
    # it that, as the command says, describe the instance.  Only a few of
    # them are needed here.
    decoded_output = json.loads(output)
    instance = decoded_output['Reservations'][0]['Instances'][0]
    args.vm_ip = instance['PublicDnsName']
    key_name = instance['KeyName']
    debug(1, "ip: %s keyname: \"%s\"" % (args.vm_ip, key_name))

    # The name of the SSH key was sent to the Create function when the VM
    # was built, and we get a chance to read it back here.  Paranoid check
    # that it is the same.  This should never happen, but check for safety.
    if key_name != args.key_name:
        error("args.key_name:\"%s\" != version vm thinks its using:\"%s\"" %
              (args.key_name, key_name))
        return 1

    return 0
def GetIPSetupCorrectly(self, args):
    ''' Called after 'running' status to make sure args.vm_ip is set (GCP).

    With google it looks like the IP address changes when restarting from
    'stop', so the stop command clears args.vm_ip.  When args.vm_ip is
    empty the instance is described and the address is taken from its
    first network interface / access config ('natIP').

    Returns 0 on success, 1 if the described instance id differs from
    args.vm_id, or the non-zero return code of the describe command.
    A status other than "RUNNING" is reported but is not a failure.
    '''
    # If we already have the IP (set in Create), there is nothing to do
    if args.vm_ip != "":
        return 0

    # don't have the IP value; hopefully the VM is in running state and
    # has an IP that we can get
    cmd = "gcloud --format=\"json\" beta compute"
    cmd += " instances describe"
    cmd += " --zone \"%s\"" % args.region       # "us-west1-b"
    cmd += " --quiet"                           # 'quiet' prevents prompting
    cmd += " \"%s\" " % args.vm_name            # takes VM Name, not a uuid as with aws/azure
    rc, output, errval = self.DoCmd(cmd)
    if rc != 0:
        error("Problems describe VM \"%s\"" % args.vm_name)
        return rc

    decoded_output = json.loads(output)         # convert json format to python structure
    trace(3, json.dumps(decoded_output, indent=4, sort_keys=True))

    # the ip value is all that was really needed
    args.vm_ip = decoded_output['networkInterfaces'][0]['accessConfigs'][0]['natIP']

    # sanity -- is the VM id returned the same as what we got from Create?
    # got the value for free, might as well check it
    vm_id = decoded_output['id']
    if vm_id != args.vm_id:
        error("Sanity - Returned vm_id:%s != vm_id value from create: %s" %
              (vm_id, args.vm_id))
        return 1

    # check status -- we should be RUNNING
    status = decoded_output['status']
    if status != "RUNNING":
        # interpolate the actual status (was a literal, shell-style "$status")
        error("Shouldn't we be RUNNING? -- current status is \"%s\"" % status)

    return 0
def CreateVM(self, args):
    ''' Create a new Alibaba VM, give it a public IP and start it.

    'args' holds the parameters.  On the way args.nsg_id, args.image_id,
    args.vm_id and args.vm_ip are filled in, and the args are saved to
    file once the instance and its IP exist.

    Returns 0 on success, non-zero on failure (including a failure of the
    final StartVM step).
    '''
    if args.vm_id not in ("None", None):
        error("Instance \"%s\" already exists, run 'deleteVM' first, or 'clean' if stale arg list"
              % args.vm_id)
        return 1

    args.vm_ip = ""             # make sure IP address is clear

    # ssh key file, builds path from options, checks existence
    retcode = self.CheckSSHKeyFilePath(args, ".pem")
    if retcode != 0:
        return retcode

    # security group, create if needed, does nothing if it already exists.
    # Should this step move outside VM create so that it better reflects
    # real VM timing?
    retcode = self.CreateNSG(args)              # sets args.nsg_id
    if retcode != 0:
        return retcode
    trace(2, "nsg_id: \"%s\" %s" % (args.nsg_name, args.nsg_id))

    # look up image-name, return region specific image id
    # TODO: saw the image lookup fail with a network error; check that the
    #       connection to Alibaba is working before calling this
    self.Inform("GetImageId")
    if self.GetImageId(args) != 0:
        return 1
    trace(2, "image_id: \"%s\" %s" % (args.image_name, args.image_id))

    # with security group and image id, we can now create the instance
    self.Inform("CreateInstance")
    cmd = 'aliyuncli ecs CreateInstance'
    cmd += " --RegionId %s" % args.region               # us-west-1
    cmd += " --ImageId %s" % args.image_id              # m-rj9gjqbdwtwlhtgqjeov
    cmd += " --SecurityGroupId %s" % args.nsg_id        # sg-rj999tz2kpxehy7obsjn
    cmd += " --InstanceType %s" % args.instance_type    # ecs.gn5-c4g1.xlarge
    cmd += " --InstanceName %s" % args.vm_name          # e.g. "newton-gn5-1gpu"
    cmd += " --InternetMaxBandwidthOut %d" % args.bandwidth_out     # 10
    cmd += " --InstanceChargeType %s" % args.charge_type            # PostPaid
    cmd += " --KeyPairName %s" % args.key_name          # baseos-alibaba-siliconvalley
    retcode, output, errval = self.DoCmd(cmd)           # call the Alibaba command
    if retcode != 0:
        error("Problems creating VM \"%s\"" % args.vm_name)
        return 1

    # decode the JSON output
    decoded_output = json.loads(output)
    trace(3, json.dumps(decoded_output, indent=4, sort_keys=True))
    args.vm_id = decoded_output['InstanceId']

    # With Alibaba, instances created via CLI are not automatically given a
    # public IP address, so assign one to the instance just created.
    # Note -- this may not work immediately after creating the VM,
    # so try a few times.
    args.vm_ip = ""
    for retrycnt in range(0, 4):
        self.Inform("AllocatePublicIpAddress")
        cmd = 'aliyuncli ecs AllocatePublicIpAddress'
        cmd += " --RegionId %s" % args.region           # us-west-1
        cmd += " --InstanceId %s" % args.vm_id          # i-rj9a0iw25hryafj0fm4v
        cmd += " 2> /dev/null"                          # don't show errors (the timeout)
        retcode, output, errval = self.DoCmdNoError(cmd)
        if retcode == 0:
            decoded_output = json.loads(output)
            trace(3, json.dumps(decoded_output, indent=4, sort_keys=True))
            args.vm_ip = decoded_output['IpAddress']
            break                                       # got IP we think -- done now

        # trace level drops with each retry, so repeated failures get louder
        trace(3 - retrycnt,
              "Problems allocating IP address for %s, retry:%d" %
              (args.vm_id, retrycnt))
        time.sleep(retrycnt)

    if args.vm_ip == "":
        error("Unable to allocating IP address for \"%s\"" % args.vm_name)
        return 1

    # save vm ID and other fields set up here, so they are not lost if an
    # error occurs later.  This is done again when we are fully started.
    self.ArgSaveToFile(args)

    # Unlike some other CSPs, alibaba does not automatically start an
    # instance when it is created.  Start it here to be consistent, and
    # report a failed start instead of always claiming success.
    return self.StartVM(args)
def CreateVM(self, args):
    ''' Create a new AWS VM, name it and wait until it is running.

    'args' holds the parameters.  On the way args.nsg_id, args.image_id
    and args.vm_id are filled in, and the args are saved to file once the
    instance has been created.

    Returns 0 on success, non-zero on failure.
    '''
    if args.vm_id not in ("None", None):
        error("Instance \"%s\" already exists, run 'deleteVM' first, or 'clean' if stale arg list"
              % args.vm_id)
        return 1

    args.vm_ip = ""             # make sure IP address is clear

    # ssh key file, builds path from options, checks existence
    retcode = self.CheckSSHKeyFilePath(args, ".pem")
    if retcode != 0:
        return retcode

    # security group, create if needed, does nothing if it already exists.
    # Consider moving this step outside VM create so that it better
    # reflects real VM timing?
    self.Inform("CreateNSG")
    if self.CreateNSG(args) != 0:               # sets args.nsg_id
        return 1
    trace(2, "nsg_id: \"%s\" %s" % (args.nsg_name, args.nsg_id))

    # look up image-name, return region specific image id
    self.Inform("GetImageId")
    if self.GetImageId(args) != 0:
        return 1
    trace(2, "image_id: \"%s\" %s" % (args.image_name, args.image_id))

    # with security group and image id, we can now create the instance
    self.Inform("run-instances")
    cmd = "aws ec2 run-instances"
    cmd += " --image-id %s" % args.image_id             # via self.GetImageId()
    cmd += " --instance-type %s" % args.instance_type   # t2.micro
    cmd += " --region %s" % args.region                 # us-west-2
    cmd += " --key-name %s" % args.key_name             # my-security-key
    cmd += " --security-group-ids %s" % args.nsg_id     # Security Group
    retcode, output, errval = self.DoCmd(cmd)           # call the AWS command
    if retcode != 0:
        error("Problems creating VM \"%s\"" % args.vm_name)
        return 1                # nothing to delete, can return

    # decode the JSON output
    decoded_output = json.loads(output)
    trace(3, json.dumps(decoded_output, indent=4, sort_keys=True))
    args.vm_id = decoded_output['Instances'][0]['InstanceId']
    args.vm_ip = ""             # don't have the IP until we see it running

    # Name the instance.  Done here instead of in the run-instances call:
    # it was tricky to get spaces/quotes right in the bash code this was
    # originally written in.
    self.Inform("create-tags")
    cmd = "aws ec2 create-tags"
    cmd += " --resource %s" % args.vm_id
    cmd += " --tags Key=Name,Value=%s" % args.vm_name   # unique time-stamped name
    # the instance lives in args.region (see run-instances above), so the
    # tag request must target the same region
    cmd += " --region %s" % args.region
    retcode, output, errval = self.DoCmd(cmd)

    # wait till the instance is up and running, pingable and ssh-able
    if retcode == 0:
        retcode = self.WaitTillRunning(args, "running", TIMEOUT_1)

    # save vm ID and other fields set up here, so they are not lost if an
    # error occurs later
    self.ArgSaveToFile(args)

    debug(2, "createVM returning %d" % retcode)
    return retcode              # 0: success, 1: failure
def time_test(my_class, outer_loop_value, args):
    ''' Generic CSP vm create/stop/start/restart/delete timing test.

    Each step is bracketed by my_time.Start() / my_time.End(name, loop, ts).
    Only the createNSG and createVM steps abort the test on failure; the
    results of the remaining steps are not checked.

    Returns 0 on success, or the non-zero code of createNSG / createVM.
    '''
    my_time = TimeClass(outer_loop_value)

    def timed(label, index, method_name):
        ''' run one CSP method on 'args', recording its elapsed time '''
        started = my_time.Start()
        result = getattr(my_class, method_name)(args)
        my_time.End(label, index, started)
        return result

    # create/get id for Network Security Group
    rc = timed("createNSG", 0, "CreateNSG")
    if rc != 0:
        return rc

    rc = timed("createVM", 0, "CreateVM")   # args is from parser.parse_args(argv)
    if rc != 0:
        error("createVM returned %d, stopping test" % rc)
        return rc

    # type of VM created - size, number of CPUs, GPUs... defined by name
    my_time.SetInstanceTypeName(args.instance_type)

    # stop/start/restart loops
    inner_steps = (("stopVM", "StopVM"),
                   ("startVM", "StartVM"),
                   ("restartVM", "RestartVM"))
    loop = 0                    # value used below if the loop isn't run (count = 0)
    for loop in range(0, args.inner_loop_cnt):
        for label, method_name in inner_steps:
            timed(label, loop, method_name)
            time.sleep(5)

    # delete vm
    timed("deleteVM", loop, "DeleteVM")

    # delete Security Group.  For alibaba a delay is needed first:
    # immediately after deleting the VM the deleteNSG fails
    time.sleep(5)
    timed("deleteNSG", loop, "DeleteNSG")

    # delete the persistent information - VM/NSG id, name..
    my_class.Clean(args)

    # final report
    my_time.SummaryInit(my_class, args)         # calculate any conclusions..
    if args.summary_report != 0:
        # extra, possibly redundant, but nicely formatted user report
        my_time.SummaryReport(my_class, args)
    my_time.SummaryLog(my_class, args)          # cut/pasteable format in log file

    return 0                    # successful return
def process_cmd(my_class, argv):
    ''' Parse 'argv' for the given CSP class instance and run the command.

    Option values are resolved in this order:
      1) hardcoded defaults in the argument definitions, or values
         determined programmatically
      2) overridden by any value in the args saved from the last run
         (if that file exists)
      3) overridden by any value specified on the command line
    Then the command is run, and afterwards all option values and
    computed/queried values (like CSP ids) are written back to the file,
    to be picked up in step 2 on the next run.

    Returns the exit code: 0 on success, non-zero on failure
    (2 when the command handler returned None).
    '''
    # first thing, verify that the connection to the CSP is up and
    # running correctly (cli app downloaded, user logged in, etc...)
    rc = my_class.CSPSetupOK()                  # csp name dependent function
    if rc != 0:
        error("CSP \"%s\" access is not configured correctly, set it up first" %
              my_class.ClassName())
        return rc

    # create the main command line argument parser
    parser = argparse.ArgumentParser(
        prog='csp',
        description='CSP simple python interface for %s' % my_class.ClassName())

    add_common_options(my_class, parser)        # common option arguments

    # positional arguments
    parser.add_argument('command',
                        help="command to execute, run 'help' for details")
    parser.add_argument('arguments',
                        help="optional csp specific args run '-h' for details",
                        nargs=argparse.REMAINDER)

    my_class.ArgOptions(parser)                 # class specific arguments
    my_class.ArgRestoreFromFile(parser)         # defaults from saved file, if any

    args = parser.parse_args(argv)

    # set global value used for trace level, as 'args' isn't passed
    # around everywhere
    trace_setlevel(args.trace)

    # CSP class specific arg checks, bail here if something isn't set correctly
    rc = my_class.ArgSanity(parser, args)
    if rc != 0:
        error("In ArgSanity rc:%d" % rc)
        return rc

    cmd = args.command          # the command that is to be run

    # commands that handle the persistent arg list
    if cmd == "clean":
        my_class.Clean(args)    # cleans out args and other cached files
        return 0
    if cmd == "args":
        my_class.ArgShowFile()
        return 0
    if cmd == "help":
        usage(my_class.m_module_path)
        return 1

    # print args if higher trace level (single-argument print calls give
    # the same output under Python 2 and are valid Python 3)
    if trace_do(2):
        print(vars(args))
        print("============")
        print("cmd=%s" % cmd)

    # commands that simply call one method with 'args' and return its code;
    # the method is looked up by name only when its command is selected
    simple_commands = {
        "createNSG": "CreateNSG",
        "deleteNSG": "DeleteNSG",
        "showNSGs": "ShowNSGs",
        "createVM": "CreateVM",
        "startVM": "StartVM",
        "stopVM": "StopVM",
        "restartVM": "RestartVM",
        "deleteVM": "DeleteVM",
        "ping": "Ping",
        "status": "Status",
        "show": "Show",
        "running": "ShowRunning",
        "regions": "ShowRegions",
        "ip": "ShowIP",
    }

    rc = 0                      # return value if not set below
    if cmd == "validCSP":
        rc = 0                  # an invalid CSP name errors out above
    elif cmd in simple_commands:
        rc = getattr(my_class, simple_commands[cmd])(args)
    elif cmd == "ssh":
        # argv[1:] is passed on to Ssh as the remaining arguments
        rc, stdoutstr, stderrstr = my_class.Ssh(args, True, argv[1:])
    elif cmd == "boottime":
        rc, kernel, user, total = my_class.KernelBootTime(args)
        if rc == 0:
            print("kernel:%s user:%s total:%s" % (kernel, user, total))
    elif cmd == "test":
        # default is 1 outer create/delete loop
        if args.outer_loop_cnt <= 0:
            error("outer_loop_cnt=0, no tests run")
        else:
            for loop in range(0, args.outer_loop_cnt):
                rc = time_test(my_class, loop, args)
                if rc != 0:
                    break
                time.sleep(30)  # time between loops
        if rc != 0:
            error("Test returned %d" % rc)
    else:
        # format the command name into the message instead of passing it
        # as a stray second argument
        error("Undefined command \"%s\"" % cmd)
        usage(my_class.m_module_path)
        rc = 1

    # save all the persistent args values to file after the above commands
    # have run and modified them -- like the VM or SecurityGroup IDs
    # NOTE(review): the command is spelled "deleteVM", so this comparison is
    # always true and the args are also saved after a delete -- confirm
    # whether that is intended before changing it.
    if cmd != "DeleteVM":
        my_class.ArgSaveToFile(args)

    if rc is None:              # handle "None" return case -- should be an error?
        error("No return code for cmd \"%s\"" % cmd)
        rc = 2

    return rc                   # exit code
def StopVM(self, args):
    ''' Stop the GCP VM and return only once it is fully stopped.

    The instance must pass CheckRunStatus(args, "RUNNING") first.  After a
    successful "instances stop" command args.vm_ip is cleared, because
    google changes the IP address after a stop.

    Returns 0 only when the VM is fully stopped ("TERMINATED"),
    non-zero on failure.
    '''
    # A missing VM id means that either a VM hasn't been created,
    # or it was already deleted.
    if self.CheckID(args) == False:
        return 1

    # Check status.  Note that the "running" string is CSP specific
    retcode = self.CheckRunStatus(args, "RUNNING")
    if retcode != 0:
        error("Not running")
        return retcode

    # Stop the VM
    self.Inform("StopVM")
    cmd = "gcloud --format=\"json\" beta compute"
    cmd += " instances stop"
    cmd += " --zone \"%s\"" % args.region       # "us-west1-b"
    cmd += " --quiet"                           # 'quiet' prevents prompting
    cmd += " \"%s\" " % args.vm_name            # takes VM Name, not a uuid as with aws/azure
    rc, output, errval = self.DoCmd(cmd)
    if rc != 0:
        # message names the operation that actually failed (was "deleting")
        error("Problems stopping VM \"%s\"" % args.vm_name)
        return rc

    decoded_output = json.loads(output)         # convert json format to python structure
    trace(3, json.dumps(decoded_output, indent=4, sort_keys=True))

    # Make sure our persistent IP address is clear - google changes the IP
    # address after a stop, so the next time we need it we go and ask for it
    args.vm_ip = ""

    # The CSP may return from the above command once the request for
    # stopping has been received.  However we don't want to return from
    # this function until we are positive that the VM has completely
    # stopped.  This check is CSP specific.
    status = self.GetRunStatus(args)
    if status != "TERMINATED":
        error("Asked VM to stop, but status = \"%s\"" % (status))
        return 1

    return self.WaitForRunStatus(args, "TERMINATED", TIMEOUT_2)
def StartVM(self, args):
    ''' Start the GCP VM and wait until it is fully up and running.

    Nothing is done when the instance is already "RUNNING".  Starting is
    only attempted from the "TERMINATED" (or "null") state.

    Returns 0 only if the VM is fully up and running, non-zero on failure.
    '''
    # A missing VM id means that either a VM hasn't been created,
    # or it was already deleted.
    if self.CheckID(args) == False:
        return 1

    # Get run status and check current state.
    # The strings being checked here are CSP specific.
    status = self.GetRunStatus(args)
    if status == "RUNNING":
        return 0                # already running, simply return
    if status == "STOPPING":
        # upper case like the other GCP states checked in this file; the
        # lower-case spelling could never match
        error("%s is in %s state, can't start running now" %
              (args.vm_id, status))
        return 1
    if status not in ("TERMINATED", "null"):
        error("id %s is in \"%s\" state, not sure can start running" %
              (args.vm_id, status))
        return 1                # unexpected status

    # start the VM
    self.Inform("StartVM")
    cmd = "gcloud --format=\"json\" beta compute"
    cmd += " instances start"
    cmd += " --zone \"%s\"" % args.region       # "us-west1-b"
    cmd += " --quiet"                           # 'quiet' prevents prompting
    cmd += " \"%s\" " % args.vm_name            # takes VM Name, not a uuid as with aws/azure
    rc, output, errval = self.DoCmd(cmd)
    if rc != 0:
        # message names the operation that actually failed (was "deleting")
        error("Problems starting VM \"%s\"" % args.vm_name)
        return rc

    decoded_output = json.loads(output)         # convert json format to python structure
    trace(3, json.dumps(decoded_output, indent=4, sort_keys=True))

    # CSP specific - verify that the VM is fully up and running, that we
    # have its IP address and can ssh into it.  Some CSPs may already
    # return from their start command in this state.
    return self.WaitTillRunning(args, "RUNNING", TIMEOUT_1)
def CreateSecurityGroup(self, args):
    ''' Security group creation is not available for gcp (google cloud).

    Traces the requested name/id, reports the error and always returns 1.
    '''
    requested = "\"%s\" %s" % (args.nsg_name, args.nsg_id)
    trace(2, requested)

    error("gcp (google cloud) does not use network security groups")
    return 1
def ExistingSecurityGroup(self, args):
    ''' Security group lookup is not available for gcp (google cloud).

    Traces the requested name, reports the error and always returns 0.
    '''
    requested = "\"%s\"" % (args.nsg_name)
    trace(2, requested)

    error("gcp (google cloud) does not use network security groups")
    return 0
def CreateSecurityGroup(self, args):
    '''
    Creates an Alibaba security group named args.nsg_name in args.region,
    saves the new id in args.nsg_id, then adds the inbound/outbound rules

    Returns 0 on success, 1 if the group could not be created or the
    outbound rule could not be added. A failing inbound rule is reported
    but is not fatal (the original ignored those failures silently).
    '''
    trace(2, "\"%s\" %s" % (args.nsg_name, args.nsg_id))

    # create security group
    cmd = 'aliyuncli ecs CreateSecurityGroup'
    cmd += " --RegionId %s" % args.region                   # us-west-1
    cmd += " --SecurityGroupName \"%s\"" % args.nsg_name    # "NvidiaSG"

    retcode, output, errval = self.DoCmd(cmd)
    if (retcode != 0):
        error("Problems creating security group")
        return 1

    decoded_output = json.loads(output)     # json text to python structure
    trace(3, json.dumps(decoded_output, indent=4, sort_keys=True))

    # new Security group ID is saved in the args structure
    security_group_id = decoded_output['SecurityGroupId']
    args.nsg_id = security_group_id

    # A new security group will not have any rules in it.
    # Inbound rules to open: (protocol, port range, description)
    # Descriptions containing spaces carry their own shell quotes.
    ingress_rules = (
        ("tcp", "22/22", "SSH"),
        ("tcp", "443/443", "HTTPS"),
        ("tcp", "5000/5000", "DIGITS6"),
        # Is value Ok? (-1/8 for Alibaba?)
        ("icmp", "-1/-1", "\"Support for ping\""),
    )
    for protocol, port_range, description in ingress_rules:
        cmd = 'aliyuncli ecs AuthorizeSecurityGroup'
        cmd += ' --RegionId %s' % args.region
        cmd += ' --SecurityGroupId %s' % security_group_id
        cmd += ' --IpProtocol %s --PortRange %s' % (protocol, port_range)
        cmd += ' --SourceCidrIp 0.0.0.0/0'
        cmd += ' --Policy accept --Description %s' % description

        retcode, output, errval = self.DoCmd(cmd)
        if (retcode != 0):
            # keep going as before, but don't hide the failure any more
            error("Problems adding security group rule %s" % description)

    # The following command will open all outbound ports:
    cmd = 'aliyuncli ecs AuthorizeSecurityGroupEgress'
    cmd += ' --RegionId %s' % args.region
    cmd += ' --SecurityGroupId %s' % security_group_id
    cmd += ' --IpProtocol all --PortRange -1/-1 --DestCidrIp 0.0.0.0/0'
    cmd += ' --Policy accept --Description \"All open!\"'

    retcode, output, errval = self.DoCmd(cmd)
    if (retcode != 0):
        error("Problems setting up security group rules")
        return 1

    return 0    # happy return
def ExistingSecurityGroup(self, args):
    '''
    Does the security group named args.nsg_name currently exist in
    args.region? If it does, its id is saved in args.nsg_id

    Returns 0 if the group was found, 1 if the name is unset, a query
    failed, or no group with that name was listed
    '''
    trace(2, "\"%s\"" % (args.nsg_name))

    # Note "None" as string from args file
    if (args.nsg_name == "" or args.nsg_name is None or
            args.nsg_name == "None"):
        error("NetworkSecurityGroup name is \"%s\"" % args.nsg_name)
        return 1

    # can it be found by name? -- get list of all names first
    cmd = 'aliyuncli ecs DescribeSecurityGroups'
    cmd += " --RegionId %s" % args.region   # us-west-1
    cmd += " --PageSize 50"                 # default is 10, max is 50
    cmd += " --output json"
    cmd += " --filter SecurityGroups.SecurityGroup[].SecurityGroupName"

    retcode, output, errval = self.DoCmd(cmd)
    if (retcode != 0):
        error("Problems describing security groups")
        return 1

    # returns a Json array of names like:
    #   [ "NexposeSG", "NewtonSG", "sg-rj93y8iuj33uosositpw" ]
    # "or []" guards against a JSON null, which would not be iterable
    group_names = json.loads(output) or []

    # Find the index of our name in the list, then query again for the
    # SecurityGroupId of the record at that same index.
    for idx, item in enumerate(group_names):
        if (item != args.nsg_name):
            continue

        cmd = 'aliyuncli ecs DescribeSecurityGroups'
        cmd += " --RegionId %s" % args.region   # us-west-1
        cmd += " --PageSize 50"                 # default is 10, max is 50
        cmd += " --output json"
        cmd += " --filter SecurityGroups.SecurityGroup["
        cmd += str(idx)
        cmd += "].SecurityGroupId"

        retcode, output, errval = self.DoCmd(cmd)
        if (retcode != 0):
            error("Problems describing security groups")
            # was "return False", which equals 0 and so read as "found"
            return 1
        trace(3, output)

        # existing Security group ID is saved in the args structure.
        # The id comes back as a quoted string followed by a newline,
        # remove the quote characters and surrounding whitespace.
        args.nsg_id = output.replace('"', '').strip()
        trace(2, "args.nsg_id: \"%s\"" % args.nsg_id)
        return 0

    # returns 1 if did not find security group
    trace(2, "Did not find security group: \"%s\"" % args.nsg_name)
    return 1
def CreateVM(self, args):
    '''
    Creates a new gcp VM. 'args' holds parameters

    On success args.vm_id and args.vm_ip are filled in from the gcloud
    output and saved with ArgSaveToFile(). When GPUs are requested and
    args.vm_name does not already contain "gpu", a "-<count>x<model>gpu"
    suffix is appended to args.vm_name.

    Returns 0 on success, non-zero on failure
    '''
    if (args.vm_id != "None" and args.vm_id is not None):
        error(
            "Instance \"%s\" already exists, run 'deleteVM' first, or 'clean' if stale arg list"
            % args.vm_id)
        return 1

    # make sure our persistant IP address is clear
    args.vm_ip = ""

    # public ssh key file, builds path from options, checks existance
    # this sets args.key_file to "keyfile.pub" (better known as "id_rsa.pub")
    retcode = self.CheckSSHKeyFilePath(args, ".pub")
    if (retcode != 0):
        return retcode
    keyfile_pub = args.key_file

    # however other than in the createVM, the private Key file
    # is required for all the local ssh'ing that we will be doing.
    # Called second, after the public key path has been captured above.
    retcode = self.CheckSSHKeyFilePath(args, "")
    if (retcode != 0):
        return retcode

    # metadata consists of user name, and the contents of the public key
    # file. With Google the entire public key string is passed in the
    # metadata, for example:
    #     metadata = "ssh-keys=<user>:ssh-rsa AAAAB3Nza... <comment>"
    # The id_rsa.pub file already starts with "ssh-rsa", so it is not
    # added explicitly here.
    with open(keyfile_pub, "r") as f:
        ssh_rsa_data = f.read()
    metadata = "ssh-keys=%s:%s" % (args.user, ssh_rsa_data)

    # with Google, don't need to create a network security group.
    # GPUs can be specified at VM init time; with other CSPs the
    # number/type of GPUs is a function of the "instance_type"
    accelerator = None      # don't assign gpus
    accelerator_count = 0   # used for delay before ping below
    if (args.accelerator_type is not None and
            args.accelerator_type != "" and
            args.accelerator_type != "None" and
            args.accelerator_count > 0):
        accelerator = "%s,count=%d" % (args.accelerator_type,
                                       args.accelerator_count)
        accelerator_count = args.accelerator_count

        # if adding GPUs, add additional info to the VM name
        #
        # Google GPU 'accelerator' types are of form: nvidia-tesla-p100,
        # too long for a VM name, so only what's after the last '-' is
        # used. Names must be all lowercase numbers/letters.
        if (args.vm_name.find("gpu") == -1):    # haven't added "gpu" yet
            # local was called "type", which shadowed the builtin
            gpu_model = args.accelerator_type[
                args.accelerator_type.rfind("-") + 1:]
            args.vm_name += "-%dx%sgpu" % (args.accelerator_count, gpu_model)

    # Create the VM
    # NOTE: with gcp, it's not necessary to assign it Network Security
    #       Groups when creating the VM's -- Called "network firewall
    #       rules", they are added later after the VM is created.
    self.Inform("CreateVM")
    cmd = "gcloud --format=\"json\" beta compute"
    cmd += " --project \"%s\" " % args.project
    cmd += "instances create \"%s\"" % args.vm_name
    cmd += " --zone \"%s\"" % args.region               # "us-west1-b"
    cmd += " --quiet"                                   # reduces noise
    cmd += " --machine-type \"%s\"" % args.instance_type  # "n1-standard-1"
    cmd += " --subnet \"%s\"" % args.subnet             # default
    cmd += " --metadata \"%s\"" % metadata
    cmd += " --maintenance-policy \"%s\"" % args.maintenance_policy  # "TERMINATE"
    cmd += " --service-account \"%s\"" % args.service_account
    # (a "--scopes" option taken from args.scopes was disabled here)
    if (accelerator is not None):       # optional if we want GPUs
        cmd += " --accelerator type=%s" % accelerator   # nvidia-tesla-p100,count=1
    cmd += " --min-cpu-platform \"%s\"" % args.min_cpu_platform  # "Automatic"
    cmd += " --image \"%s\"" % args.image_name
    cmd += " --image-project \"%s\"" % args.image_project
    cmd += " --boot-disk-size %d" % args.boot_disk_size  # 32, in GB
    cmd += " --boot-disk-type \"%s\"" % args.boot_disk_type  # "pd-standard"
    cmd += " --boot-disk-device-name \"%s\"" % args.vm_name  # same as VM name

    # To break big command into individual options per line for debugging
    #     echo $V | sed -e $'s/ --/\\\n --/g'

    # execute the command
    rc, output, errval = self.DoCmd(cmd)
    if (rc != 0):
        error("Problems creating VM \"%s\"" % args.vm_name)
        return rc

    # Get the returned information, pull out the vmID and the public IP
    # address of the VM. With gcp the IP address is part of the output
    # of the 'create' command, no need to poll for it.
    decoded_output = json.loads(output)     # json text to python structure
    trace(3, json.dumps(decoded_output, indent=4, sort_keys=True))

    # FYI: reason why [0] is used here is that the json output could
    #      possibly supply more than one instance of data. Since our
    #      request is specific to one instance, [0] is the one we want.
    instance = decoded_output[0]
    args.vm_id = instance['id']     # may not need the ID, all vm_name based
    args.vm_ip = instance['networkInterfaces'][0]['accessConfigs'][0]['natIP']

    # save vm ID and other fields now, before anything below can fail,
    # since the id is needed to delete the VM in any case
    self.ArgSaveToFile(args)

    # Google reuses IP addresses often. If this (old) IP is still in the
    # known_hosts file it will cause problems as soon as we ssh into the
    # new VM (as happens in "WaitTillRunning"), so remove it now.
    # (TODO: This step needed on other CSPs ?)
    self.DeleteIPFromSSHKnownHostsFile(args)

    # quick sanity check -- verify the name returned from the create
    # command is the same as we were given
    returned_name = instance["name"]
    if (returned_name != args.vm_name):
        error(
            "sanity check: vm name returned \"%s\" != vm_name \"%s\" given to create command"
            % (returned_name, args.vm_name))
        # the original built this dump and threw it away; trace it so the
        # mismatch can be diagnosed
        trace(2, json.dumps(decoded_output, indent=4, sort_keys=True))
        return 1

    # Seeing an error here on gcloud only where
    #
    #   1) VM is up in gcloud web page, and can ssh into it from there
    #   2) the first ping in WaitTillRunning succeeds
    #   3) the ssh in WaitTillRunning fails with a timeout
    #   4) any further ping or ssh fails
    #   5) see #1
    #
    # A delay before the first ping seems to workaround the problem.
    # 5 seconds is not enough, got 30% error rates. 10 seconds seems
    # to work at least with "n1-standard-1" instances and no gpus.
    # Adding an additional 10 seconds per GPU. Empirical value.
    delay = 10 + (accelerator_count * 10)
    debug(
        0,
        "WORKAROUND: external network connect - sleep for %d seconds before ping"
        % (delay))
    time.sleep(delay)   # wait a few seconds before ANY command to vm

    # Another sanity check -- wait till we can ping and ssh into the VM.
    # It should take little time here with gcp. rc is known to be 0 at
    # this point, so the wait is unconditional.
    rc = self.WaitTillRunning(args, "RUNNING", TIMEOUT_1)

    debug(2, "createVM returning %d" % rc)
    return rc   # 0: success, 1: failure
def DeleteSecurityGroup(self, args):
    '''
    Security group deletion entry point for gcp.

    Nothing is deleted: the request is traced, an error is reported
    and 1 is always returned.
    '''
    requested = '"%s" %s' % (args.nsg_name, args.nsg_id)
    trace(2, requested)

    reason = "gcp (google cloud) does not use network security groups"
    error(reason)
    return 1
def RestartVM(self, args):     # also known as 'reboot' on aws
    '''
    Restarts the VM named args.vm_name in zone args.region (gcp)

    The VM must currently report "RUNNING". Returns 0 on success,
    non-zero on failure
    '''
    # check for a valid VM id, returns if it's not set, indicating that
    # either a VM hasn't been created, or it was deleted.
    if (self.CheckID(args) == False):
        return 1

    # can only restart a VM if it's currently running.
    # This "running" string may be CSP specific
    retcode = self.CheckRunStatus(args, "RUNNING")
    if (retcode != 0):
        error("Not running")
        return retcode

    # Restart the VM
    # NOTE(review): this issues "instances start" against a VM that was
    # just verified to be running; "instances reset" looks like the
    # intended gcloud command -- confirm before changing.
    self.Inform("RestartVM")
    cmd = "gcloud --format=\"json\" beta compute"
    cmd += " instances start"
    cmd += " --zone \"%s\"" % args.region   # "us-west1-b"
    cmd += " --quiet"                       # suppress interactive prompts
    cmd += " \"%s\" " % args.vm_name        # takes VM name, not a uuid

    rc, output, errval = self.DoCmd(cmd)
    if (rc != 0):
        # message used to say "deleting" (copy/paste from the delete code)
        error("Problems restarting VM \"%s\"" % args.vm_name)
        return rc

    decoded_output = json.loads(output)     # json text to python structure
    trace(3, json.dumps(decoded_output, indent=4, sort_keys=True))

    # this code is CSP specific.
    #
    # The status is not expected to become "un-running" during the
    # restart -- so we check when it FAILS to ping to know if the restart
    # actually occurred. Then we simply wait till it's back up again,
    # pingable and ssh-able, to know it's running.
    #
    # Ability to ping the VM is also CSP specific, and is normally
    # setup in the Network Security Group as a specific rule.
    if (args.pingable == 1):
        rc = self.WaitForPing(args, False, TIMEOUT_2)
        # single-argument call form prints the same on python 2 and 3
        print("Saw Pingable rc=%d" % rc)
    else:
        time.sleep(5)   # let VM go down enough so SSH stops (we hope)
        rc = 0          # fake success, since ping isn't supported

    if (rc != 0):
        error("never went un-pingable. Did VM restart?")
    else:
        rc = self.WaitTillRunning(args, "RUNNING", TIMEOUT_1)

    # Per the original comments, 0 only if the VM is fully up and
    # running, its public IP is known and it can be ssh'd into
    return rc   # 0: success, 1: failure