def run(job_name, num_worker, max_worker, model, hparam_set, problem,
        train_steps, ckpt_frequency, automation_test='0', run='0',
        profile='0'):
    """Acquire a cluster, run one training job on it, then delete its VMs.

    The cluster is named ``<job_name>-<num_worker>-<max_worker>``; that name
    is passed to ``ResourceManager.acquire_resource``, to
    ``start_one_time_training.sh`` and is the prefix of every VM deleted at
    the end.

    Args:
        job_name: Base name of the job (string).
        num_worker: Worker count as a string; it is concatenated into the
            cluster name and converted with ``int()`` where a number is
            needed.
        max_worker: Maximum worker count as a string; compared against
            ``num_worker`` to pick the slot flag and forwarded to the
            training script.
        model, hparam_set, problem, train_steps, ckpt_frequency: Forwarded
            unchanged to ``start_one_time_training.sh``.
        automation_test, run, profile: Accepted but not used by this
            function.
    """
    cluster_name = job_name + '-' + num_worker + '-' + max_worker

    job = resource_acquisition.ResourceManager()
    job.acquire_resource(cluster_name, 1, 4, int(num_worker), 100)

    # Slot flag handed to the training script: '0' when the cluster already
    # runs at its maximum worker count, '1' otherwise.
    set_slot = '0' if num_worker == max_worker else '1'
    worker_temp = str(int(num_worker) - 1)
    subprocess.call(
        ["./start_one_time_training.sh", cluster_name, '1', worker_temp, '1',
         'gs://shijian-18-ml', model, hparam_set, problem, train_steps,
         ckpt_frequency, '0', '0', '0', set_slot, max_worker])

    # The original branched on ``int(num_worker) + 1 < 3`` but both branches
    # issued exactly the same deletions, so the branch is collapsed here.
    # Four worker names are always attempted, regardless of num_worker.
    delete_prefix = ("gcloud compute instances delete --zone us-east1-c "
                     + cluster_name)
    vm_suffixes = ["-ps-0", "-master"]
    vm_suffixes.extend("-worker-" + str(i) for i in range(4))
    for suffix in vm_suffixes:
        # Trailing '&' backgrounds each gcloud call in the shell.
        os.system(delete_prefix + suffix + " -q &")

    # NOTE(review): the source formatting was lost, so the nesting level of
    # this sleep is an assumption; it is placed at function level so it runs
    # once after all the backgrounded deletions have been issued. Confirm.
    time.sleep(300)
def start_job(self):
    """Optionally acquire the cluster, run the training script, then stop it.

    Unless ``self.train_only`` is set, heterogeneous resources are acquired
    through ``ResourceManager.acquire_hetero_resource`` and the master plus
    ``num_worker - 1`` workers are recorded via ``self.add_vm_to_db``.
    ``start_one_time_training.sh`` is then run synchronously, and if the
    master VM still reports ``RUNNING`` afterwards the cluster is stopped
    with the reason ``'Job completed'``.
    """
    global IS_CREATING, CURRENT_RUN

    manager = resource_acquisition.ResourceManager()
    extra_workers = int(self.num_worker) - 1
    worker_temp = str(extra_workers)

    if not self.train_only:
        # gpu_array / zone_array are not attributes of self; they are
        # resolved from the enclosing module scope.
        server_lists = manager.acquire_hetero_resource(
            self.preemptible,
            self.job_name,
            int(self.num_ps),
            int(self.ps_core_num),
            int(self.num_worker),
            self.limit,
            gpu_array,
            zone_array,
        )
        self.add_vm_to_db(server_lists, 'master', 0)
        for worker_index in range(extra_workers):
            self.add_vm_to_db(server_lists, 'workers', worker_index)

    # NOTE(review): the source formatting was lost; this reset is assumed to
    # sit at function level (executed whether or not VMs were created).
    IS_CREATING = False

    training_cmd = [
        "./start_one_time_training.sh",
        self.job_name,
        self.num_ps,
        worker_temp,
        '1',
        self.bucket_dir,
        self.model,
        self.hparam_set,
        self.problem,
        self.train_steps,
        self.ckpt_frequency,
        '0',
        self.run_num,
        self.profile,
    ]
    subprocess.call(training_cmd)

    master_status = self.check_status(self.job_name + '-master', self.zone)
    if master_status['status'] != 'RUNNING':
        return
    self.stop_cluster(self.job_name, 'Job completed')
def main(proj_name, cred_path, job_name, num_ps, ps_core_num, num_worker,
         num_shard, bucket_dir, model, hparam_set, problem, train_steps,
         ckpt_frequency, automation_test, profile, limit, setSlot, maxWorker,
         zone=None, gpu=None, hetero='0', gpu_array=None, zone_array=None):
    """Validate the credential path and acquire heterogeneous resources.

    When ``cred_path`` exists it is exported as
    ``GOOGLE_APPLICATION_CREDENTIALS``; otherwise a message is printed and
    the function returns without touching the environment or acquiring
    anything. Resources are only acquired when ``hetero`` equals ``'1'``.

    Args:
        proj_name: Project name, forwarded to ``acquire_hetero_resource``.
        cred_path: Filesystem path of the credentials file.
        job_name: Job name, forwarded to ``acquire_hetero_resource``.
        num_ps, ps_core_num, num_worker: Numeric strings (or ints); each is
            converted with ``int()`` before being forwarded.
        limit, gpu_array, zone_array: Forwarded unchanged.
        hetero: String flag; ``'1'`` selects heterogeneous acquisition. It
            now defaults to ``'0'`` because a parameter without a default
            may not follow defaulted ones (the original signature was a
            SyntaxError). Its position is unchanged.
        num_shard, bucket_dir, model, hparam_set, problem, train_steps,
        ckpt_frequency, automation_test, profile, setSlot, maxWorker, zone,
        gpu: Accepted but not used in the visible body.

    Returns:
        None.
    """
    # NOTE(review): this binds a function-local name only. If the intent is
    # to update a module-level PROJECTNAME, a ``global`` declaration is
    # missing - confirm against the rest of the module.
    PROJECTNAME = proj_name

    if not os.path.exists(cred_path):
        print("Please provide a valid credential path.")
        return
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = cred_path

    job = resource_acquisition.ResourceManager()
    # Fixed: the original used ``=`` (assignment) here, which is a
    # SyntaxError inside an ``if`` condition.
    if hetero == '1':
        # NOTE(review): server_lists is not consumed in the visible body;
        # the function may continue beyond what is shown here.
        server_lists = job.acquire_hetero_resource(
            proj_name, cred_path, job_name, int(num_ps), int(ps_core_num),
            int(num_worker), limit, gpu_array, zone_array)
def create_eval(self):
    """Create the ``<job_name>-evaluator`` VM and start the evaluator on it.

    The VM is created in ``self.eval_zone`` through ``ResourceManager``,
    its status is checked, ``start-evaluator.sh`` is copied to the VM's home
    directory with ``gcloud compute scp`` and the script is then launched
    over ``gcloud compute ssh`` with the job name as its argument. The ssh
    command ends in ``&`` so the shell runs it in the background.
    """
    evaluator_vm = self.job_name + '-evaluator'
    eval_zone = self.eval_zone

    manager = resource_acquisition.ResourceManager()
    # 'k80' is presumably the GPU type for the evaluator - confirm against
    # ResourceManager.create_instance.
    manager.create_instance(COMPUTE, PROJECTNAME, eval_zone, evaluator_vm,
                            False, '4', True, 'k80')
    manager.check_instance_status(COMPUTE, PROJECTNAME, eval_zone,
                                  evaluator_vm)

    copy_script_cmd = ('gcloud compute scp --zone ' + eval_zone
                       + ' --recurse start-evaluator.sh ozymandias@'
                       + evaluator_vm + ':~')
    os.system(copy_script_cmd)

    launch_cmd = ('gcloud compute ssh ozymandias@' + evaluator_vm
                  + ' --zone ' + eval_zone
                  + ' -- bash start-evaluator.sh ' + self.job_name + ' &')
    os.system(launch_cmd)
def run(self):
    """Start a new VM for this task and run ``start_sub.sh`` against it.

    A fresh instance named ``self.vmname`` is started, its status is
    queried through ``ResourceManager.check_instance_status`` and
    ``start_sub.sh`` is then invoked synchronously with the job name, VM
    name, index and zone.
    """
    # TODO: Zone and GPU needs to be automated
    zone = 'us-west1-b'
    gpu_type = 'k80'

    startNewInstance(self.vmname, zone, gpu_type, True, True)

    resource_manager = resource_acquisition.ResourceManager()
    # The returned address pair is not used afterwards; the unpacking is
    # kept so a differently-shaped return value still raises here.
    _internal_ip, _external_ip = resource_manager.check_instance_status(
        self.compute, self.projectName, zone, self.vmname)

    start_sub_cmd = [
        "/home/ozymandias/proj_code/code/spotTrain/start_sub.sh",
        self.jobName,
        self.vmname,
        str(self.index),
        zone,
    ]
    subprocess.call(start_sub_cmd)
def main(zone, model, hparam_set, problem, train_steps, ckpt_frequency,
         automation_test='0', run='0', profile='0'):
    """Create one VM, run a single training job on it, fetch logs, delete it.

    The VM is named ``<zone>-<hparam_set>`` with underscores in
    ``hparam_set`` replaced by hyphens. After the instance reports
    ``RUNNING`` and accepts an SSH command, ``start_one_time_training.sh``
    is run, ``ping.txt`` and ``iperf3.txt`` are copied from the VM with
    ``gcloud compute scp`` and the instance is deleted in the background.

    Args:
        zone: Compute zone; also the prefix of the VM name.
        model, hparam_set, problem, train_steps, ckpt_frequency: Forwarded
            to ``start_one_time_training.sh``; ``hparam_set`` is also used
            in the VM name and in the local log file names.
        automation_test, run, profile: Accepted but not used by this
            function.
    """
    name = zone + '-' + hparam_set.replace('_', '-')

    job = resource_acquisition.ResourceManager()
    job.create_instance(COMPUTE, PROJECTNAME, zone, name, False, '4', True)

    request = COMPUTE.instances().get(project=PROJECTNAME, zone=zone,
                                      instance=name)
    result = request.execute()
    command = "echo VM READY"
    username = "******"
    home_var = os.environ['HOME']

    while result['status'] != 'RUNNING':
        # Short pause between polls so the API is not queried in a tight
        # loop while the instance boots.
        time.sleep(1)
        request = COMPUTE.instances().get(project=PROJECTNAME, zone=zone,
                                          instance=name)
        result = request.execute()

    ip = result['networkInterfaces'][0]['accessConfigs'][0]['natIP']
    port = 22

    client = paramiko.SSHClient()
    try:
        client.load_system_host_keys()
        client.set_missing_host_key_policy(paramiko.WarningPolicy)
        # Keep retrying until sshd on the VM accepts a command.
        while True:
            try:
                client.connect(
                    ip, port=port, username=username, password=None,
                    key_filename=home_var + "/.ssh/google_compute_engine")
                client.exec_command(command)
                break
            except (paramiko.ssh_exception.BadHostKeyException,
                    paramiko.ssh_exception.AuthenticationException,
                    paramiko.ssh_exception.SSHException,
                    paramiko.ssh_exception.socket.error):
                # Parenthesised form prints the same text on Python 2 and
                # is valid syntax on Python 3.
                print("Retrying SSH to VM")
                time.sleep(1)
    finally:
        # Release the connection even if an unexpected exception escapes.
        client.close()

    subprocess.call(
        ["./start_one_time_training.sh", name, '1', '0', '1',
         'gs://shijian-18-ml', model, hparam_set, problem, train_steps,
         ckpt_frequency, '0', '0', '0'])

    os.system("gcloud compute scp --zone " + zone + " ozymandias@" + name
              + ":~/ping.txt ${HOME}/desktop/spotTrain_data/1_29/"
              + zone + "_" + hparam_set + "_ping.txt")
    os.system("gcloud compute scp --zone " + zone + " ozymandias@" + name
              + ":~/iperf3.txt ${HOME}/desktop/spotTrain_data/1_29/"
              + zone + "_" + hparam_set + "_iperf3.txt")
    # Trailing '&' backgrounds the delete in the shell.
    os.system("gcloud compute instances delete --zone " + zone + " " + name
              + " -q &")
def main(job_name, num_ps, ps_core_num, num_worker, num_shard, bucket_dir,
         model, hparam_set, problem, train_steps, ckpt_frequency,
         automation_test, profile, limit, setSlot, maxWorker, gpu):
    """Acquire resources in ``us-west1-b`` and run one training job.

    ``num_ps``, ``ps_core_num`` and ``num_worker`` are converted with
    ``int()`` for ``ResourceManager.acquire_resource``. The training script
    receives ``num_worker - 1`` as its worker argument, and ``profile``,
    ``setSlot`` and ``maxWorker`` converted with ``str()``.
    """
    resource_manager = resource_acquisition.ResourceManager()
    # The return value is bound but not used afterwards in this function.
    server_lists = resource_manager.acquire_resource(
        job_name,
        int(num_ps),
        int(ps_core_num),
        int(num_worker),
        limit,
        zone='us-west1-b',
        gpu_type=gpu,
    )

    workers_minus_one = int(num_worker) - 1
    training_cmd = ["./start_one_time_training.sh", job_name, num_ps,
                    str(workers_minus_one), num_shard, bucket_dir, model,
                    hparam_set, problem, train_steps, ckpt_frequency,
                    automation_test, '0']
    training_cmd += [str(profile), str(setSlot), str(maxWorker)]
    subprocess.call(training_cmd)