def GetImageId(self, args):
    ''' Resolves args.image_name to an Alibaba image id, saved in args.image_id

    Does nothing when args.image_id is already set. An unset id may show up
    as the string "None" when it comes from the args file.

    Returns 0 on success, 1 if the query fails or no image id can be found
    in the reply.
    '''
    # if already have the ID, can skip this step
    if (args.image_id not in ("", None, "None")):
        return 0

    # query name, to return id
    cmd = "aliyuncli ecs DescribeImages"
    cmd += " --RegionId %s" % args.region
    cmd += " --ImageName \"%s\"" % args.image_name
    cmd += " --ImageOwnerAlias %s" % args.image_owner_alias

    retcode, output, errval = self.DoCmd(cmd)
    if (retcode != 0):
        error(errval)
        return 1

    # decode the JSON output
    decoded_output = json.loads(output)
    trace(2, json.dumps(decoded_output, indent=4, sort_keys=True))

    # when the reply holds no image entry there is nothing at index [0];
    # report that as a normal failure instead of raising out of this call
    try:
        args.image_id = decoded_output['Images']['Image'][0]['ImageId']
    except (KeyError, IndexError, TypeError):
        error("Image \"%s\" not found in region %s" %
              (args.image_name, args.region))
        return 1
    return 0
def GetRunStatus(self, args):
    ''' Returns the 'status' field of "gcloud instances describe" for the VM

    The status string (something like "RUNNING" or "STOPPED") is also passed
    to self.Inform(). If the VM id check fails 1 is returned, and if the
    describe command fails its return code is returned instead of a string.
    '''
    if (self.CheckID(args) == False):
        return 1

    # note gcloud takes the VM Name, not a uuid as with aws/azure
    describe_cmd = "".join([
        "gcloud --format=\"json\" beta compute",
        " instances describe",
        " --zone \"%s\"" % args.region,     # "us-west1-b"
        " --quiet",                         # no interactive prompting
        " \"%s\" " % args.vm_name,
    ])

    rc, output, errval = self.DoCmd(describe_cmd)
    if (rc != 0):
        error("Problems describe VM \"%s\"" % args.vm_name)
        return rc

    # convert json format to python structure
    instance = json.loads(output)
    trace(3, json.dumps(instance, indent=4, sort_keys=True))

    current_status = instance['status']
    self.Inform(current_status)
    return current_status
def GetIPSetupCorrectly(self, args):
    ''' Called after 'running' status to make sure args.vm_ip is set

    If args.vm_ip is already set (normally done in Create) nothing is done.
    Otherwise the VM is described and its natIP is saved in args.vm_ip.

    Returns 0 on success, the describe command's return code if it fails,
    or 1 if the returned VM id differs from args.vm_id.
    '''
    # With google, it looks like the IP address gets changed when restarting
    # from 'stop'. -- SO we must clear it in our stop command !
    if (args.vm_ip != ""):
        return 0            # already have it, no need to ask

    # don't have IP value, hopefully VM is in running state and will
    # have a IP that we can get
    cmd = "gcloud --format=\"json\" beta compute"
    cmd += " instances describe"
    cmd += " --zone \"%s\"" % args.region   # "us-west1-b"
    cmd += " --quiet"                       # no interactive prompting
    cmd += " \"%s\" " % args.vm_name        # takes VM Name, not a uuid

    rc, output, errval = self.DoCmd(cmd)
    if (rc != 0):
        error("Problems describe VM \"%s\"" % args.vm_name)
        return rc

    decoded_output = json.loads(output)
    trace(3, json.dumps(decoded_output, indent=4, sort_keys=True))

    # ip value that was all that was really needed
    args.vm_ip = decoded_output['networkInterfaces'][0]['accessConfigs'][0]['natIP']

    # sanity -- is VM id returned same as what we got from Create?
    # got the value for free, might as well check it
    vm_id = decoded_output['id']
    if (vm_id != args.vm_id):
        error("Sanity - Returned vm_id:%s != vm_id value from create: %s" %
              (vm_id, args.vm_id))
        return 1

    # check status -- we should be RUNNING. This is only reported, it does
    # not change the return value.
    status = decoded_output['status']
    if (status != "RUNNING"):
        # the original message used a bash-style "$status" that was never
        # expanded; format the real value in
        error("Shouldn't we be RUNNING? -- current status is \"%s\"" % status)
    return 0
def DeleteSecurityGroup(self, args):
    ''' Deletes the AWS security group identified by args.nsg_id

    Returns 1 if args.nsg_id is None (nothing to delete), the return code
    of the aws command if it fails, and 0 on success. On success
    args.nsg_id is reset to None.
    '''
    trace(2, "\"%s\" %s" % (args.nsg_name, args.nsg_id))

    if (args.nsg_id is None):
        # build the message with '%': error() is handed one pre-formatted
        # string at every other call site in this code
        error("NSG %s already deleted" % args.nsg_name)
        return 1

    cmd = "aws ec2 delete-security-group"
    cmd += " --group-id %s" % args.nsg_id

    retcode, output, errval = self.DoCmd(cmd)   # call the AWS command
    if (retcode != 0):
        return retcode

    args.nsg_id = None      # remove id from args
    return 0
def ExistingSecurityGroup(self, args):
    ''' Looks for args.nsg_name among the AWS security groups of the region

    Returns 0 and saves the group's id in args.nsg_id if the name is found.
    Returns 1 if the name is unset, the describe command fails, or no group
    has that name.
    '''
    trace(2, "\"%s\"" % (args.nsg_name))

    if (args.nsg_name in ("", None, "None")):
        error("NetworkSecurityGroup name is \"%s\"" % args.nsg_name)
        return 1

    # The whole list of groups is pulled in and searched by name here
    describe_cmd = "".join([
        "aws ec2 describe-security-groups ",
        " --region %s" % args.region,       # us-west-2
    ])
    retcode, output, errval = self.DoCmd(describe_cmd)
    if (retcode != 0):
        error("Problems describing security groups")
        return 1

    groups = json.loads(output)["SecurityGroups"]

    # slow search for name, first match wins
    for position, group in enumerate(groups):
        if (group["GroupName"] != args.nsg_name):
            continue
        args.nsg_id = group["GroupId"]
        debug(2, "%2d %-12s \"%s\"" %
              (position, group["GroupId"], group["GroupName"]))
        return 0            # found it

    trace(2, "Did not find security group: \"%s\"" % args.nsg_name)
    return 1
def DeleteSecurityGroup(self, args):
    ''' Deletes the Alibaba security group args.nsg_id, retrying on failure

    Up to 5 attempts are made (deleting right after deleteVM errors), with
    a sleep of 'attempt number' seconds after each failed one. On success
    args.nsg_id is cleared to "".

    Returns the return code of the last delete command that was run.
    '''
    trace(2, "\"%s\" %s" % (args.nsg_name, args.nsg_id))

    # the command does not change between attempts, build it once
    delete_cmd = "".join([
        'aliyuncli ecs DeleteSecurityGroup',
        ' --RegionId %s' % args.region,             # us-west-1
        ' --SecurityGroupId %s' % args.nsg_id,      # "sg-rj999tz2kpxehy7obsjn"
        " 2> /dev/null",                            # don't show errors
    ])

    for attempt in range(0, 5):
        self.Inform("DeleteNSG")
        retcode, output, errval = self.DoCmdNoError(delete_cmd)
        if (retcode == 0):
            args.nsg_id = ""        # clear out the id
            return retcode
        trace(3 - attempt,
              "Problems deleting security group \"%s\" retry:%d" %
              (args.nsg_name, attempt))
        time.sleep(attempt)

    return retcode
def CreateVM(self, args):
    ''' Creates a new AWS instance from the parameters held in 'args'

    Steps: check the ssh key file, create/find the security group, look up
    the image id, run the instance, tag it with args.vm_name and wait until
    it is running. args.vm_id is set from the run-instances output and the
    args are saved to file before returning.

    Returns 0 on success, non-zero on failure.
    '''
    if (args.vm_id not in ("None", None)):
        error(
            "Instance \"%s\" already exists, run 'deleteVM' first, or 'clean' if stale arg list"
            % args.vm_id)
        return 1

    args.vm_ip = ""         # make sure IP address is clear

    # ssh key file, builds path from options, checks existence
    key_status = self.CheckSSHKeyFilePath(args, ".pem")
    if (key_status != 0):
        return key_status

    # security group, created if needed -- sets args.nsg_id
    self.Inform("CreateNSG")
    if (self.CreateNSG(args) != 0):
        return 1
    trace(2, "nsg_id: \"%s\" %s" % (args.nsg_name, args.nsg_id))

    # look up image-name, return region specific image id
    self.Inform("GetImageId")
    if (self.GetImageId(args) != 0):
        return 1
    trace(2, "image_id: \"%s\" %s" % (args.image_name, args.image_id))

    # with security group and image id, we can now create the instance
    self.Inform("run-instances")
    run_cmd = "".join([
        "aws ec2 run-instances",
        " --image-id %s" % args.image_id,               # via self.GetImageId()
        " --instance-type %s" % args.instance_type,     # t2.micro
        " --region %s" % args.region,                   # us-west-2
        " --key-name %s" % args.key_name,               # my-security-key
        " --security-group-ids %s" % args.nsg_id,       # Security Group
    ])
    retcode, output, errval = self.DoCmd(run_cmd)
    if (retcode != 0):
        error("Problems creating VM \"%s\"" % args.vm_name)
        return 1            # nothing to delete, can return

    # decode the JSON output
    run_result = json.loads(output)
    trace(3, json.dumps(run_result, indent=4, sort_keys=True))
    args.vm_id = run_result['Instances'][0]['InstanceId']
    args.vm_ip = ""         # no IP yet

    # Name the instance. Done as its own step instead of in the
    # run-instances call.
    self.Inform("create-tags")
    tag_cmd = "".join([
        "aws ec2 create-tags",
        " --resource %s" % args.vm_id,
        " --tags Key=Name,Value=%s" % args.vm_name,     # unique time-stamped name
    ])
    retcode, output, errval = self.DoCmd(tag_cmd)

    # wait till the instance is up and running, pingable and ssh-able
    if (retcode == 0):
        retcode = self.WaitTillRunning(args, "running", TIMEOUT_1)

    # save vm ID and other fields that were set up here
    self.ArgSaveToFile(args)

    debug(2, "createVM returning %d" % retcode)
    return retcode          # 0: success, 1: failure
def CreateSecurityGroup(self, args):
    ''' Creates an AWS security group with its ingress rules

    Looks up the VPC id when args.vpcid is unset (the first VPC returned is
    used), creates the group, tags it with its name and then authorizes
    each ingress rule. The new group id is saved in args.nsg_id.

    Returns 0 on success. Returns the failing command's return code if a
    setup command fails, 1 if no VPC is found, or the last non-zero return
    code seen while adding the ingress rules.
    '''
    trace(2, "\"%s\" %s" % (args.nsg_name, args.nsg_id))

    # Get the users VPC id if we don't have it
    if (args.vpcid in ("", None, "None")):
        cmd = "aws ec2 describe-vpcs"
        cmd += " --region %s" % args.region
        retcode, output, errval = self.DoCmd(cmd)
        if (retcode != 0):
            return retcode
        decoded_output = json.loads(output)
        debug(2, json.dumps(decoded_output, indent=4, sort_keys=True))
        vpcs = decoded_output.get("Vpcs") or []
        if (not vpcs):
            # nothing to index -- fail cleanly instead of raising
            error("No VPC found in region %s" % args.region)
            return 1
        args.vpcid = vpcs[0]["VpcId"]
        debug(1, "args.vpcid <--- %s" % args.vpcid)

    # create the security group, with a meaningful description
    desc = "NSG Generated for %s" % args.vm_name
    cmd = "aws ec2 create-security-group"
    cmd += " --group-name %s" % args.nsg_name
    cmd += " --description \"%s\"" % desc
    cmd += " --vpc-id %s" % args.vpcid
    cmd += " --region %s" % args.region
    retcode, output, errval = self.DoCmd(cmd)
    if (retcode != 0):
        return retcode

    # get the groupid of the new security group
    decoded_output = json.loads(output)
    debug(2, json.dumps(decoded_output, indent=4, sort_keys=True))
    args.nsg_id = decoded_output["GroupId"]
    debug(1, "args.nsg_id <--- %s" % args.nsg_id)

    # tag new group with our group name
    cmd = "aws ec2 create-tags"
    cmd += " --resource %s" % args.nsg_id
    cmd += " --tags Key=Name,Value=%s" % args.nsg_name
    retcode, output, errval = self.DoCmd(cmd)
    if (retcode != 0):
        return retcode

    # Security rules -- kept as a plain list so they are easy to change.
    # slow, but this code is rarely used. understandability is more important
    ingress = [
        {"IpProtocol": "tcp", "ToPort": 22, "FromPort": 22,
         "CidrIp": "0.0.0.0/0", "Description": "For SSH"},
        {"IpProtocol": "tcp", "ToPort": 443, "FromPort": 443,
         "CidrIp": "0.0.0.0/0", "Description": "For SSL"},
        {"IpProtocol": "tcp", "ToPort": 5000, "FromPort": 5000,
         "CidrIp": "0.0.0.0/0", "Description": "For NVIDIA DIGITS6"},
        {"IpProtocol": "icmp", "ToPort": -1, "FromPort": 8,
         "CidrIp": "0.0.0.0/0", "Description": "To allow to be pinged"},
    ]

    outer_retcode = 0
    for rule in ingress:
        # the rules carry no "Name" key, and '%' needs both values in a
        # tuple -- the original call raised before any rule was added
        self.Inform("CreateNSG rule %s.%s" %
                    (args.nsg_name, rule["Description"]))

        cmd = "aws ec2 authorize-security-group-ingress"
        cmd += " --group-id %s" % args.nsg_id
        cmd += " --ip-permissions '[{"      # mini-embedded json like
        cmd += " \"IpProtocol\":\"%s\"," % rule["IpProtocol"]
        # KEEP 'To' before 'From' - no effect for tcp, but required for
        # how Wildcard ICMP type is defined
        cmd += " \"ToPort\":%s," % rule["ToPort"]
        cmd += " \"FromPort\":%s," % rule["FromPort"]
        cmd += " \"IpRanges\": [{"
        cmd += " \"CidrIp\":\"%s\"," % rule["CidrIp"]
        cmd += " \"Description\":\"%s\"" % rule["Description"]
        cmd += " }]"
        cmd += " }]'"

        retcode, output, errval = self.DoCmd(cmd)
        if (retcode != 0):
            outer_retcode = retcode     # keep any non-zero return code

    # egress rules -- as of 1/2018 there aren't any...
    return outer_retcode
def CreateVM(self, args):
    ''' Creates a new Alibaba instance from the parameters held in 'args'

    Steps: check the ssh key file, create/find the security group, look up
    the image id, create the instance, allocate a public IP for it (a few
    attempts are made), save the args to file and finally start the VM.
    Sets args.vm_id and args.vm_ip.

    Returns 0 on success, non-zero on failure -- including a failure of
    the final StartVM step.
    '''
    if (args.vm_id != "None" and args.vm_id is not None):
        error(
            "Instance \"%s\" already exists, run 'deleteVM' first, or 'clean' if stale arg list"
            % args.vm_id)
        return 1

    args.vm_ip = ""         # make sure IP address is clear

    # ssh key file, builds path from options, checks existence
    retcode = self.CheckSSHKeyFilePath(args, ".pem")
    if (retcode != 0):
        return retcode

    # security group, created if needed -- sets args.nsg_id
    retcode = self.CreateNSG(args)
    if (retcode != 0):
        return retcode
    trace(2, "nsg_id: \"%s\" %s" % (args.nsg_name, args.nsg_id))

    # look up image-name, return region specific image id
    # TODO: saw this 'aliyuncli ecs describe-images' fail with network error
    #       check if connection to Alibaba is working before calling this
    self.Inform("GetImageId")
    if (self.GetImageId(args) != 0):
        return 1
    trace(2, "image_id: \"%s\" %s" % (args.image_name, args.image_id))

    # with security group and image id, we can now create the instance
    self.Inform("CreateInstance")
    cmd = 'aliyuncli ecs CreateInstance'
    cmd += " --RegionId %s" % args.region               # us-west-1
    cmd += " --ImageId %s" % args.image_id              # m-rj9gjqbdwtwlhtgqjeov
    cmd += " --SecurityGroupId %s" % args.nsg_id        # sg-rj999tz2kpxehy7obsjn
    cmd += " --InstanceType %s" % args.instance_type    # ecs.gn5-c4g1.xlarge
    cmd += " --InstanceName %s" % args.vm_name          # "newton-gn5-1gpu"
    cmd += " --InternetMaxBandwidthOut %d" % args.bandwidth_out     # 10
    cmd += " --InstanceChargeType %s" % args.charge_type            # PostPaid
    cmd += " --KeyPairName %s" % args.key_name          # baseos-alibaba-siliconvalley

    retcode, output, errval = self.DoCmd(cmd)
    if (retcode != 0):
        error("Problems creating VM \"%s\"" % args.vm_name)
        return 1

    # decode the JSON output
    decoded_output = json.loads(output)
    trace(3, json.dumps(decoded_output, indent=4, sort_keys=True))
    args.vm_id = decoded_output['InstanceId']

    # with Alibaba, Instances created via CLI are not automatically given a
    # public IP address, so assign one to the instance just created.
    # note -- this may not work immediately after creating VM. try a few times
    args.vm_ip = ""
    for retrycnt in range(0, 4):
        self.Inform("AllocatePublicIpAddress")
        cmd = 'aliyuncli ecs AllocatePublicIpAddress'
        cmd += " --RegionId %s" % args.region       # us-west-1
        cmd += " --InstanceId %s" % args.vm_id      # i-rj9a0iw25hryafj0fm4v
        cmd += " 2> /dev/null"                      # don't show errors (the timeout)

        retcode, output, errval = self.DoCmdNoError(cmd)
        if (retcode == 0):
            decoded_output = json.loads(output)
            trace(3, json.dumps(decoded_output, indent=4, sort_keys=True))
            args.vm_ip = decoded_output['IpAddress']
            break           # got IP we think -- done now

        trace(3 - retrycnt,
              "Problems allocating IP address for %s, retry:%d" %
              (args.vm_id, retrycnt))
        time.sleep(retrycnt)

    if (args.vm_ip == ""):
        error("Unable to allocating IP address for \"%s\"" % args.vm_name)
        return 1

    # save vm ID and other fields that were set up here, before starting
    self.ArgSaveToFile(args)

    # the instance is not started by the create command, start it here.
    # The result of the start is what the caller gets back: it used to be
    # assigned and then dropped, so a failed start was reported as success.
    return self.StartVM(args)
def CreateSecurityGroup(self, args):
    ''' Creates an Alibaba security group and adds its rules

    The id of the new group is saved in args.nsg_id. Inbound rules open
    ports 22 (SSH), 443 (HTTPS), 5000 (DIGITS6) and icmp; one outbound rule
    opens everything.

    Returns 0 on success, 1 if the group can't be created or the outbound
    rule command fails. The results of the inbound rule commands are not
    checked.
    '''
    trace(2, "\"%s\" %s" % (args.nsg_name, args.nsg_id))

    # create security group
    create_cmd = "".join([
        'aliyuncli ecs CreateSecurityGroup',
        " --RegionId %s" % args.region,                     # us-west-1
        " --SecurityGroupName \"%s\"" % args.nsg_name,      # "NvidiaSG"
    ])
    retcode, output, errval = self.DoCmd(create_cmd)
    if (retcode != 0):
        error("Problems creating security group")
        return 1

    # decode the JSON output
    created = json.loads(output)
    trace(3, json.dumps(created, indent=4, sort_keys=True))
    security_group_id = created['SecurityGroupId']

    # new Security group ID is saved in the args structure
    args.nsg_id = security_group_id

    # A new security group will not have any rules in it.
    # (protocol, port range, description) of every inbound rule to add
    inbound_rules = (
        ('tcp', '22/22', 'SSH'),
        ('tcp', '443/443', 'HTTPS'),
        ('tcp', '5000/5000', 'DIGITS6'),
        ('icmp', '-1/-1', '\"Support for ping\"'),  # Is value Ok? (-1/8 for Alibaba?)
    )
    for protocol, port_range, description in inbound_rules:
        rule_cmd = 'aliyuncli ecs AuthorizeSecurityGroup'
        rule_cmd += ' --RegionId %s' % args.region              # us-west-1
        rule_cmd += ' --SecurityGroupId %s' % security_group_id
        rule_cmd += ' --IpProtocol %s --PortRange %s' % (protocol, port_range)
        rule_cmd += ' --SourceCidrIp 0.0.0.0/0'
        rule_cmd += ' --Policy accept --Description %s' % description
        self.DoCmd(rule_cmd)        # return code is not looked at

    # The following command will open all outbound ports:
    egress_cmd = "".join([
        'aliyuncli ecs AuthorizeSecurityGroupEgress',
        ' --RegionId %s' % args.region,                     # us-west-1
        ' --SecurityGroupId %s' % security_group_id,
        ' --IpProtocol all --PortRange -1/-1 --DestCidrIp 0.0.0.0/0',
        ' --Policy accept --Description \"All open!\"',
    ])
    retcode, output, errval = self.DoCmd(egress_cmd)
    if (retcode != 0):
        error("Problems setting up security group rules")
        return 1
    return 0                # happy return
def ExistingSecurityGroup(self, args):
    ''' Does the security group name currently exist ? get it if it does

    Lists the security group names of the region (one page of at most 50),
    and when args.nsg_name is among them queries the id of the entry at
    the same index and saves it in args.nsg_id.

    Returns 0 if the group was found, 1 if the name is unset, a command
    fails, or the name is not in the list.
    '''
    trace(2, "\"%s\"" % (args.nsg_name))

    if (args.nsg_name in ("", None, "None")):
        error("NetworkSecurityGroup name is \"%s\"" % args.nsg_name)
        return 1

    # can it be found by name? -- get list of all names first
    cmd = 'aliyuncli ecs DescribeSecurityGroups'
    cmd += " --RegionId %s" % args.region       # us-west-1
    cmd += " --PageSize 50"                     # default is 10, max is 50
    cmd += " --output json"
    cmd += " --filter SecurityGroups.SecurityGroup[].SecurityGroupName"

    retcode, output, errval = self.DoCmd(cmd)
    if (retcode != 0):
        error("Problems describing security groups")
        return 1

    # returns a Json object like:
    #   [
    #       "NexposeSG",
    #       "NewtonSG",
    #       "sg-rj93y8iuj33uosositpw"
    #   ]
    # which the json converter turns into a list of names
    group_names = json.loads(output) or []

    if (args.nsg_name not in group_names):
        trace(2, "Did not find security group: \"%s\"" % args.nsg_name)
        return 1

    # position of the first matching name; the same index is used to pull
    # the SecurityGroupId out of the matching record
    idx = group_names.index(args.nsg_name)

    cmd = 'aliyuncli ecs DescribeSecurityGroups'
    cmd += " --RegionId %s" % args.region       # us-west-1
    cmd += " --PageSize 50"                     # default is 10, max is 50
    cmd += " --output json"
    cmd += " --filter SecurityGroups.SecurityGroup["
    cmd += str(idx)
    cmd += "].SecurityGroupId"

    retcode, output, errval = self.DoCmd(cmd)
    if (retcode != 0):
        error("Problems describing security groups")
        # must be a non-zero code: the previous 'return False' compares
        # equal to 0 and so read as "found" to callers checking for 0
        return 1
    trace(3, output)

    # existing Security group ID is saved in the args structure. The id
    # comes back wrapped in quote characters and with a newline, remove both
    args.nsg_id = output.replace('"', '').strip()
    trace(2, "args.nsg_id: \"%s\"" % args.nsg_id)
    return 0
def StopVM(self, args):
    ''' Stops the VM and waits until it is reported as "TERMINATED"

    The VM must have a valid id and currently be "RUNNING". args.vm_ip is
    cleared once the stop command has been accepted.

    Returns 0 only when the VM is fully stopped, non-zero otherwise.
    '''
    # check for a valid VM id, returns if it's not set, indicating that
    # either a VM hasn't been created, or it was deleted.
    if (self.CheckID(args) == False):
        return 1

    # Checks status. Note that "running" string may be CSP specific
    retcode = self.CheckRunStatus(args, "RUNNING")
    if (retcode != 0):
        error("Not running")
        return retcode

    # Stop the VM
    self.Inform("StopVM")
    cmd = "gcloud --format=\"json\" beta compute"
    cmd += " instances stop"
    cmd += " --zone \"%s\"" % args.region   # "us-west1-b"
    cmd += " --quiet"                       # no interactive prompting
    cmd += " \"%s\" " % args.vm_name        # takes VM Name, not a uuid

    rc, output, errval = self.DoCmd(cmd)
    if (rc != 0):
        error("Problems stopping VM \"%s\"" % args.vm_name)
        return rc

    decoded_output = json.loads(output)
    trace(3, json.dumps(decoded_output, indent=4, sort_keys=True))

    # make sure our persistent IP address is clear - google changes IP
    # address after stop. So the next time we need it, we go and ask for it
    args.vm_ip = ""

    # The CSP may return from the above command once the request for
    # stopping has been received, but we don't want to return from this
    # function until the VM has completely stopped.
    status = self.GetRunStatus(args)
    if (status not in ("STOPPING", "TERMINATED")):
        error("Asked VM to stop, but status = \"%s\"" % (status))
        return 1

    # "STOPPING" is the transient state on the way down; in both accepted
    # states wait for the final "TERMINATED"
    return self.WaitForRunStatus(args, "TERMINATED", TIMEOUT_2)
def RestartVM(self, args):      # also known as 'reboot' on aws
    ''' Restarts the VM and waits until it is running again

    The VM must have a valid id and currently be "RUNNING". After the
    command is issued this waits for the VM to stop answering pings (or
    just sleeps when args.pingable is not 1) and then waits until it is
    running again.

    Returns 0 on success, non-zero on failure.
    '''
    # check for a valid VM id, returns if it's not set, indicating that
    # either a VM hasn't been created, or it was deleted.
    if (self.CheckID(args) == False):
        return 1

    # can only restart a VM if it's currently running.
    retcode = self.CheckRunStatus(args, "RUNNING")
    if (retcode != 0):
        error("Not running")
        return retcode

    # Restart the VM
    # NOTE(review): this issues "instances start", the same sub-command
    # that StartVM uses -- confirm whether "instances reset" was intended.
    self.Inform("RestartVM")
    cmd = "gcloud --format=\"json\" beta compute"
    cmd += " instances start"
    cmd += " --zone \"%s\"" % args.region   # "us-west1-b"
    cmd += " --quiet"                       # no interactive prompting
    cmd += " \"%s\" " % args.vm_name        # takes VM Name, not a uuid

    rc, output, errval = self.DoCmd(cmd)
    if (rc != 0):
        error("Problems restarting VM \"%s\"" % args.vm_name)
        return rc

    decoded_output = json.loads(output)
    trace(3, json.dumps(decoded_output, indent=4, sort_keys=True))

    # this code is CSP specific.
    #
    # We check for when the VM FAILS to ping to know that a restart
    # actually occurred. Then we simply wait till it's back up again -
    # pingable and ssh-able - to know it's running.
    #
    # Ability to ping the VM is also CSP specific, and is normally
    # setup in the Network Security Group as a specific rule.
    if (args.pingable == 1):
        rc = self.WaitForPing(args, False, TIMEOUT_2)
        print("Saw Pingable rc=%d" % rc)
    else:
        time.sleep(5)       # let VM go down enough so SSH stops (we hope)
        rc = 0              # fake success, since ping isn't supported

    if (rc != 0):
        error("never went un-pingable. Did VM restart?")
    else:
        rc = self.WaitTillRunning(args, "RUNNING", TIMEOUT_1)

    # returns 0 only if VM is fully up and running, we have its public IP
    # and can ssh into it
    return rc               # 0: success, 1: failure
def StartVM(self, args):
    ''' Starts the VM and waits until it is fully running

    Returns 0 right away if the VM is already "RUNNING". The start command
    is only issued from the "TERMINATED" (or "null") state; any other
    state is reported as an error.

    Returns 0 on success, non-zero on failure.
    '''
    # check for a valid VM id, returns if it's not set, indicating that
    # either a VM hasn't been created, or it was deleted.
    if (self.CheckID(args) == False):
        return 1

    # Get run status and check current state
    # The strings being checked here may be CSP specific.
    status = self.GetRunStatus(args)
    if (status == "RUNNING"):
        return 0            # already running, simply return

    if (status in ("STOPPING", "stopping")):
        # the other states checked here are upper case, so the upper case
        # spelling is accepted next to the original lower case one
        error("%s is in %s state, can't start running now" %
              (args.vm_id, status))
        return 1
    if (status != "TERMINATED" and status != "null"):
        error("id %s is in \"%s\" state, not sure can start running" %
              (args.vm_id, status))
        return 1

    # start the VM
    self.Inform("StartVM")
    cmd = "gcloud --format=\"json\" beta compute"
    cmd += " instances start"
    cmd += " --zone \"%s\"" % args.region   # "us-west1-b"
    cmd += " --quiet"                       # no interactive prompting
    cmd += " \"%s\" " % args.vm_name        # takes VM Name, not a uuid

    rc, output, errval = self.DoCmd(cmd)
    if (rc != 0):
        error("Problems starting VM \"%s\"" % args.vm_name)
        return rc

    decoded_output = json.loads(output)
    trace(3, json.dumps(decoded_output, indent=4, sort_keys=True))

    # CSP specific - verify that the VM is fully up and running, and that
    # we have its IP address and can ssh into it.
    rc = self.WaitTillRunning(args, "RUNNING", TIMEOUT_1)

    # returns 0 only if VM is fully up and running, we have its public IP
    # and can ssh into it
    return rc               # 0: success, 1: failure
def CreateVM(self, args):
    ''' Creates a new Google Cloud VM from the parameters held in 'args'

    Reads the public ssh key and passes it as metadata, optionally adds
    GPUs (args.accelerator_type / args.accelerator_count, which also
    extends args.vm_name), creates the instance, saves args.vm_id and
    args.vm_ip, then waits until the VM can be reached.

    Returns 0 on success, non-zero on failure.
    '''
    if (args.vm_id != "None" and args.vm_id is not None):
        error(
            "Instance \"%s\" already exists, run 'deleteVM' first, or 'clean' if stale arg list"
            % args.vm_id)
        return 1

    # make sure our persistent IP address is clear
    args.vm_ip = ""

    # public ssh key file, builds path from options, checks existence
    # this sets args.key_file to "keyfile.pub" (better known as "id_rsa.pub")
    retcode = self.CheckSSHKeyFilePath(args, ".pub")
    if (retcode != 0):
        return retcode
    keyfile_pub = args.key_file

    # however other than in the createVM, the private Key file
    # is required for all the local ssh'ing that we will be doing
    retcode = self.CheckSSHKeyFilePath(args, "")
    if (retcode != 0):
        return retcode

    # metadata consists of user name, and the "ssh key" file.
    # With Google the entire public key string is passed in the metadata:
    #     metadata = "ssh-keys=<user>:ssh-rsa AAAAB3... <comment>"
    # The id_rsa.pub file already starts with "ssh-rsa AAAAB3...", so
    # "ssh-rsa" does not have to be added here.
    with open(keyfile_pub, "r") as f:
        # drop the newline that ends the key file so it does not end up
        # inside the quoted --metadata value on the command line
        ssh_rsa_data = f.read().strip()
    metadata = "ssh-keys=%s:%s" % (args.user, ssh_rsa_data)

    # with Google, don't need to create a network security group.
    # Also GPUs can be specified at VM init time; with other CSPs,
    # number/type of GPU's is a function of the "instance_type"
    accelerator_count = 0   # used for delay before ping below
    if (args.accelerator_type not in (None, "", "None")
            and args.accelerator_count > 0):
        accelerator = "%s,count=%d" % (args.accelerator_type,
                                       args.accelerator_count)
        accelerator_count = args.accelerator_count

        # if adding GPUs, add additional info to the VM name
        #
        # Google GPU 'accelerator' types are of form: nvidia-tesla-p100 -
        # too long for the VM name, so only what follows the last '-' is
        # used. Remember with google, names must all be lowercase
        # numbers/letters
        if (args.vm_name.find("gpu") == -1):    # haven't added "gpu" yet
            gpu_type = args.accelerator_type[
                args.accelerator_type.rfind("-") + 1:]
            args.vm_name += "-%dx%sgpu" % (args.accelerator_count, gpu_type)
    else:
        accelerator = None  # don't assign gpus

    # Create the VM
    # NOTE: with gcp, it's not necessary to assign it Network Security
    #       Groups when creating the VM's -- Called "network firewall
    #       rules", they are added later after the VM is created.
    self.Inform("CreateVM")
    cmd = "gcloud --format=\"json\" beta compute"
    cmd += " --project \"%s\" " % args.project              # "my-project"
    cmd += "instances create \"%s\"" % args.vm_name
    cmd += " --zone \"%s\"" % args.region                   # "us-west1-b"
    cmd += " --quiet"                                       # reduces noise output
    cmd += " --machine-type \"%s\"" % args.instance_type    # "n1-standard-1"
    cmd += " --subnet \"%s\"" % args.subnet                 # default
    cmd += " --metadata \"%s\"" % metadata
    cmd += " --maintenance-policy \"%s\"" % args.maintenance_policy   # "TERMINATE"
    cmd += " --service-account \"%s\"" % args.service_account
    # NOTE: no --scopes option is passed (it was commented out)
    if (accelerator is not None):                           # optional if we want GPUs
        cmd += " --accelerator type=%s" % accelerator       # nvidia-tesla-p100,count=1
    cmd += " --min-cpu-platform \"%s\"" % args.min_cpu_platform       # "Automatic"
    cmd += " --image \"%s\"" % args.image_name
    cmd += " --image-project \"%s\"" % args.image_project   # "nvidia-ngc-public"
    cmd += " --boot-disk-size %d" % args.boot_disk_size     # 32, in GB
    cmd += " --boot-disk-type \"%s\"" % args.boot_disk_type # "pd-standard"
    cmd += " --boot-disk-device-name \"%s\"" % args.vm_name # assume same as VM name

    # execute the command
    rc, output, errval = self.DoCmd(cmd)
    if (rc != 0):
        error("Problems creating VM \"%s\"" % args.vm_name)
        return rc

    # Get the returned information, pull out the vmID and the public IP
    # address of the VM
    #
    # NOTE: with gcp, IP address is assigned in output from 'create'
    #       command, don't need to poll for it
    decoded_output = json.loads(output)
    trace(3, json.dumps(decoded_output, indent=4, sort_keys=True))

    # FYI: reason why [0] is used here is that json output format could
    #      possibly supply more than one instance of data. Our request
    #      is specific to one instance.
    instance = decoded_output[0]
    args.vm_id = instance['id']     # may not actually need the ID, all vm_name based
    args.vm_ip = instance['networkInterfaces'][0]['accessConfigs'][0]['natIP']

    # save vm ID and other fields that were set up here -- the VM id is
    # needed to delete the VM in any case
    self.ArgSaveToFile(args)

    # Google has a habit of reusing the IP addresses. Since this is
    # possibly an old IP with a new VM, an entry for that IP in the
    # known_hosts file is going to cause problems when we ssh into it
    # (as will happen right away with "WaitTillRunning"), so remove it now.
    # (TODO: This step needed on other CSPs ?)
    self.DeleteIPFromSSHKnownHostsFile(args)

    # quick sanity check -- verify the name returned from the create
    # command is the same as we were given. (A json.dumps() call whose
    # result was thrown away used to sit here; the full output has already
    # been traced above.)
    returned_name = instance["name"]
    if (returned_name != args.vm_name):
        error(
            "sanity check: vm name returned \"%s\" != vm_name \"%s\" given to create command"
            % (returned_name, args.vm_name))
        return 1

    # Seeing an error here on gcloud only where
    #
    #   1) VM is up in gcloud web page, and can ssh into it there
    #   2) the first ping in WaitTillRunning succeeds
    #   3) the ssh in WaitTillRunning fails with a timeout
    #   4) any further ping or ssh fails
    #   5) see #1
    #
    # A delay before the first ping seems to workaround the problem
    # 5 seconds is not enough, got 30% error rates. 10 seconds seems
    # to work at least with "n1-standard-1" instances and no gpus
    #
    # Adding an additional 10 seconds per GPU. Empirical value
    delay = 10 + (accelerator_count * 10)
    debug(
        0,
        "WORKAROUND: external network connect - sleep for %d seconds before ping"
        % (delay))
    time.sleep(delay)       # wait a few seconds before ANY command to vm

    # Another sanity check -- wait till we can ping and ssh into the VM.
    rc = self.WaitTillRunning(args, "RUNNING", TIMEOUT_1)

    # returns 0 only if VM is fully up and running, we have its public IP
    # and can ssh into it
    debug(2, "createVM returning %d" % rc)
    return rc               # 0: success, 1: failure
def DeleteSecurityGroup(self, args):
    ''' Not supported on gcp: logs an error and always returns 1 '''
    group_label = "\"%s\" %s" % (args.nsg_name, args.nsg_id)
    trace(2, group_label)

    error("gcp (google cloud) does not use network security groups")
    return 1
def CreateSecurityGroup(self, args):
    ''' Not supported on gcp: logs an error and always returns 1

    Nothing is created and args.nsg_id is left untouched.
    '''
    group_label = "\"%s\" %s" % (args.nsg_name, args.nsg_id)
    trace(2, group_label)

    error("gcp (google cloud) does not use network security groups")
    return 1
def ExistingSecurityGroup(self, args):
    ''' Not supported on gcp: logs an error and always returns 0

    No lookup is done and args.nsg_id is left untouched.
    '''
    group_label = "\"%s\"" % (args.nsg_name)
    trace(2, group_label)

    error("gcp (google cloud) does not use network security groups")
    return 0