Esempio n. 1
0
def run(job_name, num_worker, max_worker, model, hparam_set, problem,
        train_steps, ckpt_frequency, automation_test='0', run='0',
        profile='0'):
    """Acquire a cluster, run one training job on it, then delete its VMs.

    The cluster is named ``<job_name>-<num_worker>-<max_worker>``, so
    ``job_name``, ``num_worker`` and ``max_worker`` must all be strings.

    Args:
        job_name: Prefix of the cluster name.
        num_worker: Worker count as a numeric string; it is converted with
            ``int()`` for the resource request and for the worker index
            handed to the training script.
        max_worker: Forwarded to the training script as its last argument;
            also compared against ``num_worker`` to pick the slot flag.
        model, hparam_set, problem, train_steps, ckpt_frequency: Forwarded
            unchanged to ``./start_one_time_training.sh``.
        automation_test, run, profile: Accepted but not read by this
            function.
    """
    manager = resource_acquisition.ResourceManager()
    cluster = job_name + '-' + num_worker + '-' + max_worker
    manager.acquire_resource(cluster, 1, 4, int(num_worker), 100)

    # Slot flag passed to the script: '0' when the two counts are the same
    # string, '1' otherwise.
    set_slot = '0' if num_worker == max_worker else '1'

    last_worker = str(int(num_worker) - 1)

    training_cmd = [
        "./start_one_time_training.sh", job_name, '1', last_worker, '1',
        'gs://shijian-18-ml', model, hparam_set, problem, train_steps,
        ckpt_frequency, '0', '0', '0', set_slot, max_worker,
    ]
    subprocess.call(training_cmd)

    # The original waits 300 seconds after issuing the deletes unless
    # int(num_worker) + 1 < 3; the delete commands are the same either way.
    skip_wait = int(num_worker) + 1 < 3

    delete_prefix = "gcloud compute instances delete --zone us-east1-c " + cluster
    vm_suffixes = ["-ps-0", "-master"]
    vm_suffixes.extend("-worker-" + str(idx) for idx in range(4))
    for suffix in vm_suffixes:
        # Trailing '&' backgrounds each delete in the shell.
        os.system(delete_prefix + suffix + " -q &")

    if not skip_wait:
        time.sleep(300)
 def start_job(self):
     """Optionally acquire the cluster, run the training script, then stop.

     Unless ``self.train_only`` is truthy, heterogeneous resources are
     acquired through ``ResourceManager.acquire_hetero_resource`` using the
     module-level ``gpu_array`` and ``zone_array``, the master and each
     non-master worker are recorded via ``self.add_vm_to_db``, and the
     module-level ``IS_CREATING`` flag is reset to ``False``.

     ``./start_one_time_training.sh`` is then run synchronously. Afterwards,
     if the ``<job_name>-master`` instance still reports ``RUNNING``, the
     cluster is stopped with the reason ``'Job completed'``.
     """
     global IS_CREATING
     manager = resource_acquisition.ResourceManager()

     # The script receives the worker count minus one as a string.
     extra_workers = int(self.num_worker) - 1
     last_worker = str(extra_workers)

     if not self.train_only:
         servers = manager.acquire_hetero_resource(
             self.preemptible, self.job_name, int(self.num_ps),
             int(self.ps_core_num), int(self.num_worker), self.limit,
             gpu_array, zone_array)
         self.add_vm_to_db(servers, 'master', 0)
         for worker_idx in range(extra_workers):
             self.add_vm_to_db(servers, 'workers', worker_idx)
         IS_CREATING = False

     training_cmd = [
         "./start_one_time_training.sh",
         self.job_name,
         self.num_ps,
         last_worker,
         '1',
         self.bucket_dir,
         self.model,
         self.hparam_set,
         self.problem,
         self.train_steps,
         self.ckpt_frequency,
         '0',
         self.run_num,
         self.profile,
     ]
     subprocess.call(training_cmd)

     master = self.check_status(self.job_name + '-master', self.zone)
     if master['status'] == 'RUNNING':
         self.stop_cluster(self.job_name, 'Job completed')
Esempio n. 3
0
def main(proj_name, cred_path, job_name, num_ps, ps_core_num, num_worker, num_shard, bucket_dir, model, hparam_set, problem, train_steps, ckpt_frequency, automation_test, profile, limit, setSlot, maxWorker, zone=None, gpu=None, hetero='0', gpu_array=None, zone_array=None):
    """Set up Google credentials and, if requested, acquire hetero resources.

    Args:
        proj_name: Project name; forwarded to ``acquire_hetero_resource``.
        cred_path: Path to a credentials file. If it does not exist, a
            message is printed and the function returns without doing
            anything else.
        job_name: Forwarded to ``acquire_hetero_resource``.
        num_ps, ps_core_num, num_worker: Numeric values (or numeric
            strings); each is converted with ``int()`` before forwarding.
        limit, gpu_array, zone_array: Forwarded unchanged.
        hetero: Heterogeneous acquisition runs only when this equals the
            string ``'1'``. Defaults to ``'0'``; the default is required
            because the parameter follows other defaulted parameters.
        num_shard, bucket_dir, model, hparam_set, problem, train_steps,
        ckpt_frequency, automation_test, profile, setSlot, maxWorker, zone,
        gpu: Accepted but not read in this function.

    Returns:
        None.
    """
    # NOTE(review): this binds a *local* name; if the intent was to update a
    # module-level PROJECTNAME, a ``global`` declaration is needed — confirm.
    PROJECTNAME = proj_name
    if not os.path.exists(cred_path):
        print("Please provide a valid credential path.")
        return
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = cred_path

    job = resource_acquisition.ResourceManager()
    # Comparison, not assignment: the original ``if hetero = '1'`` was a
    # syntax error.
    if hetero == '1':
        # The returned server list is not used further in this function.
        server_lists = job.acquire_hetero_resource(proj_name, cred_path, job_name, int(num_ps), int(ps_core_num), int(num_worker), limit, gpu_array, zone_array)
 def create_eval(self):
     """Create the ``<job_name>-evaluator`` VM and launch the evaluator on it.

     The instance is created in ``self.eval_zone`` and
     ``check_instance_status`` is called for it. ``start-evaluator.sh`` is
     then copied to the VM with ``gcloud compute scp`` and started over
     ``gcloud compute ssh`` with the job name as its argument; the ssh
     command is backgrounded in the shell by its trailing ``&``.
     """
     manager = resource_acquisition.ResourceManager()
     vm_name = self.job_name + '-evaluator'

     manager.create_instance(
         COMPUTE, PROJECTNAME, self.eval_zone, vm_name, False, '4', True,
         'k80')
     manager.check_instance_status(
         COMPUTE, PROJECTNAME, self.eval_zone, vm_name)

     copy_cmd = 'gcloud compute scp --zone ' + self.eval_zone
     copy_cmd += ' --recurse start-evaluator.sh ozymandias@' + self.job_name
     copy_cmd += '-evaluator:~'
     os.system(copy_cmd)

     launch_cmd = 'gcloud compute ssh ozymandias@' + self.job_name
     launch_cmd += '-evaluator --zone ' + self.eval_zone
     launch_cmd += ' -- bash start-evaluator.sh ' + self.job_name + ' &'
     os.system(launch_cmd)
Esempio n. 5
0
 def run(self):
     """Start a new VM for ``self.vmname`` and run ``start_sub.sh`` for it.

     The instance is started through ``startNewInstance`` in a hard-coded
     zone with a ``'k80'`` GPU argument, ``check_instance_status`` is called
     for it, and the sub-job script is then invoked synchronously with the
     job name, VM name, index and zone.
     """
     ## TODO: Zone and GPU needs to be automated
     zone = 'us-west1-b'
     gpu = 'k80'

     manager = resource_acquisition.ResourceManager()
     startNewInstance(self.vmname, zone, gpu, True, True)

     # The call returns two values; neither is used afterwards.
     _int_ip, _ip = manager.check_instance_status(
         self.compute, self.projectName, zone, self.vmname)

     script = "/home/ozymandias/proj_code/code/spotTrain/start_sub.sh"
     subprocess.call(
         [script, self.jobName, self.vmname, str(self.index), zone])
Esempio n. 6
0
def main(zone, model, hparam_set, problem, train_steps, ckpt_frequency, automation_test='0', run='0', profile='0'):
    """Create a single VM, train on it once, fetch its logs and delete it.

    The VM is named ``<zone>-<hparam_set>`` with every underscore in
    ``hparam_set`` replaced by a hyphen. The function waits until the
    instance reports ``RUNNING`` and until an SSH command succeeds, then
    runs ``./start_one_time_training.sh``, copies ``ping.txt`` and
    ``iperf3.txt`` back with ``gcloud compute scp`` and issues a background
    delete for the instance.

    Args:
        zone: Zone string used for the instance and every gcloud command.
        model, hparam_set, problem, train_steps, ckpt_frequency: Forwarded
            to the training script; ``hparam_set`` must be a string because
            it is also used in the VM name and the log file names.
        automation_test, run, profile: Accepted but not read here.

    Raises:
        KeyError: If the ``HOME`` environment variable is not set.
    """
    # Presumably hyphens are required because instance names do not allow
    # underscores — confirm against the naming rules.
    name = zone + '-' + hparam_set.replace('_', '-')

    job = resource_acquisition.ResourceManager()
    job.create_instance(COMPUTE, PROJECTNAME, zone, name, False, '4', True)

    command = "echo VM READY"
    username = "******"
    home_var = os.environ['HOME']

    # Poll until the instance is RUNNING, pausing between requests so the
    # API is not hit in a tight loop.
    result = COMPUTE.instances().get(project=PROJECTNAME, zone=zone, instance=name).execute()
    while result['status'] != 'RUNNING':
        time.sleep(1)
        result = COMPUTE.instances().get(project=PROJECTNAME, zone=zone, instance=name).execute()

    ip = result['networkInterfaces'][0]['accessConfigs'][0]['natIP']
    port = 22
    client = paramiko.SSHClient()
    try:
        client.load_system_host_keys()
        client.set_missing_host_key_policy(paramiko.WarningPolicy)
        # Retry until a trivial command can be executed over SSH.
        while True:
            try:
                client.connect(ip, port=port, username=username, password=None,
                               key_filename=home_var + "/.ssh/google_compute_engine")
                client.exec_command(command)
                break
            except (paramiko.ssh_exception.BadHostKeyException, paramiko.ssh_exception.AuthenticationException,
                    paramiko.ssh_exception.SSHException, paramiko.ssh_exception.socket.error):
                print("Retrying SSH to VM")
                time.sleep(1)
    finally:
        # Close the client even if an unexpected exception escapes the loop.
        client.close()

    subprocess.call(
        ["./start_one_time_training.sh", name, '1', '0', '1', 'gs://shijian-18-ml', model, hparam_set,
         problem, train_steps, ckpt_frequency, '0', '0', '0'])

    os.system("gcloud compute scp --zone " + zone + " ozymandias@" + name + ":~/ping.txt ${HOME}/desktop/spotTrain_data/1_29/"+zone+"_"+hparam_set+"_ping.txt")
    os.system("gcloud compute scp --zone " + zone + " ozymandias@" + name + ":~/iperf3.txt ${HOME}/desktop/spotTrain_data/1_29/" + zone + "_" + hparam_set + "_iperf3.txt")

    # Trailing '&' backgrounds the delete in the shell.
    os.system("gcloud compute instances delete --zone " + zone + " " + name + " -q &")
Esempio n. 7
0
def main(job_name, num_ps, ps_core_num, num_worker, num_shard, bucket_dir,
         model, hparam_set, problem, train_steps, ckpt_frequency,
         automation_test, profile, limit, setSlot, maxWorker, gpu):
    """Acquire resources in ``us-west1-b`` and run the training script once.

    ``num_ps``, ``ps_core_num`` and ``num_worker`` are converted with
    ``int()`` for the resource request. The training script receives the
    worker count minus one as a string, a literal ``'0'`` after
    ``automation_test``, and ``profile``, ``setSlot`` and ``maxWorker``
    converted with ``str()``; the remaining arguments are forwarded as given.
    """
    manager = resource_acquisition.ResourceManager()
    # The value returned by acquire_resource is not used.
    manager.acquire_resource(
        job_name, int(num_ps), int(ps_core_num), int(num_worker), limit,
        zone='us-west1-b', gpu_type=gpu)

    last_worker = str(int(num_worker) - 1)

    training_cmd = ["./start_one_time_training.sh", job_name, num_ps,
                    last_worker, num_shard, bucket_dir, model, hparam_set,
                    problem, train_steps, ckpt_frequency, automation_test,
                    '0']
    training_cmd.extend(str(flag) for flag in (profile, setSlot, maxWorker))
    subprocess.call(training_cmd)