예제 #1
0
def main(job_id, params):
    """Train one job on the remote instance, profile it, and return the merged metrics.

    Steps, in order:
      1. Serialise ``params`` to JSON and write a job file under
         ``PATH_TO_HACONE/jobs``.
      2. Copy the job file to the AWS instance with ``scp`` and launch the
         training script there through ``ssm.run_command``.
      3. Wait in ``listen_to_remote_process`` on the remote accuracy file.
      4. Copy the training outputs and the accuracy file back, and call
         ``buckets.download_from_s3`` on the ``astar-trainedmodels`` bucket.
      5. Copy the job file to the monitoring server, run
         ``monitor_inference_4_GPU.py`` there over ``ssh`` and fetch the
         resulting hardware metrics file.
      6. Merge the accuracy entries into the hardware metrics, write the
         merged dict back to ``hardware_metrics.txt`` and return it.

    Args:
        job_id: Identifier of the job; converted to ``str`` before use.
        params: Mapping of parameter name to a value accepted by ``int()``.

    Returns:
        dict: Content of the hardware metrics file updated with every entry
        of the accuracy file.

    Note:
        Return codes of the ``scp``/``ssh`` calls are not checked; a failed
        transfer surfaces later as an error when the expected file is opened.
    """
    job_id = str(job_id)
    print(params)

    # Encode params to pass them through the run_command: double quotes in
    # the keys are replaced by single quotes and every value is cast to int.
    my_params = json.dumps(
        {key.replace('"', '\''): int(params[key]) for key in params}
    )
    dico_to_save = {'job': job_id, 'params': my_params}
    name = create_name(my_params)

    file_to_send = PATH_TO_HACONE + "/jobs/job{}.txt".format(name)
    file_to_get_back = '/home/ubuntu/accuracy_{}.txt'.format(job_id)
    # Local directory that receives everything produced for this job.
    result_dir = PATH_TO_HACONE + '/Spearmint-PESM/examples/cifar10/output/{}'.format(name)

    # json.dump emits text, so the file must be opened in text mode
    # (binary mode raises TypeError on Python 3).
    with open(file_to_send, 'w') as fp:
        json.dump(dico_to_save, fp)

    # Now we need to send this file (the job file) to the AWS instance.
    subprocess.call("scp -i {} {} ubuntu@{}:~/job_file_{}.txt".format(private_key, file_to_send, public_DNS, job_id), shell=True)

    # Launch the train and eval script.
    command = [
        "export LD_LIBRARY_PATH=/usr/local/cuda-9.0/lib64:$LD_LIBRARY_PATH",
        "./train_eval_image_classifier_bis.py 7 {} --max_number_of_steps=28125".format(job_id),
    ]
    print(command)
    ssm.run_command(instance_id, command)

    print('Training launched')
    # Presumably blocks until the accuracy file exists on the instance --
    # TODO confirm against listen_to_remote_process.
    listen_to_remote_process(file_to_get_back)
    print('Training finished')

    # Bring the trained model directory and the accuracy file back locally.
    subprocess.call("scp -i {} -r ubuntu@{}:~/outputs/cifar10_nns/{}/ {}/".format(private_key, public_DNS, job_id, result_dir), shell=True)
    subprocess.call("scp -i {} ubuntu@{}:~/accuracy_{}.txt {}/accuracy.txt".format(private_key, public_DNS, job_id, result_dir), shell=True)

    buckets.download_from_s3('astar-trainedmodels')

    with open(result_dir + '/accuracy.txt') as my_file:
        dico = json.load(my_file)

    print(file_to_send)

    # Hardware profiling happens on a separate server reached over ssh.
    subprocess.call("scp {} {}@{}:/home/{}".format(file_to_send, user, server_DNS, user), shell=True)
    subprocess.call("ssh {}@{} './monitor_inference_4_GPU.py {} {}'".format(user, server_DNS, name, job_id), shell=True)
    # NOTE(review): the remote path is hard-coded to /home/arturo while the
    # job file above is sent to /home/<user> -- confirm this is intended.
    subprocess.call("scp {}@{}:/home/arturo/hardware_metrics_{}.txt {}/hardware_metrics.txt ".format(user, server_DNS, job_id, result_dir), shell=True)

    metrics_path = result_dir + '/hardware_metrics.txt'
    with open(metrics_path) as my_file:
        dico_hardware = json.load(my_file)

    # Accuracy entries overwrite hardware entries that share the same key.
    dico_hardware.update(dico)

    with open(metrics_path, 'w') as my_file:
        json.dump(dico_hardware, my_file)
    return dico_hardware
def main(job_id, params):
    """Train one job on the remote instance, profile it locally, and return the merged metrics.

    Steps, in order:
      1. Serialise ``params`` to JSON and write a job file under
         ``PATH_TO_HACONE/jobs``.
      2. Copy the job file to the AWS instance with ``scp`` and launch the
         training script there through ``ssm.run_command``.
      3. Poll ``ssm.notif_listener`` until the command leaves the
         ``'InProgress'`` state.
      4. On ``'Success'``, copy the outputs and the accuracy file back, then
         run a second remote command that uploads the output directory to
         S3 and deletes it, and wait for that command too.
      5. Call ``buckets.download_from_s3`` and load the accuracy file.
      6. Run ``monitor_inference.py`` locally, merge the accuracy entries
         into the hardware metrics it produced, write the merged dict back
         and return it.

    Args:
        job_id: Identifier of the job; converted to ``str`` before use.
        params: Mapping of parameter name to a value accepted by ``int()``.

    Returns:
        dict: Content of the hardware metrics file updated with every entry
        of the accuracy file.

    Note:
        Return codes of the ``scp`` and monitoring calls are not checked.
        When the training command ends in a state other than ``'Success'``
        nothing is copied back, yet the accuracy file is still opened.
    """
    job_id = str(job_id)
    print(params)

    # Encode params to pass them through the run_command: double quotes in
    # the keys are replaced by single quotes and every value is cast to int.
    my_params = json.dumps(
        {key.replace('"', '\''): int(params[key]) for key in params}
    )
    dico_to_save = {'job': job_id, 'params': my_params}

    file_to_send = PATH_TO_HACONE + "/jobs/job_file_{}.txt".format(job_id)
    # Results are stored below PATH_TO_HACONE so that the files copied back
    # by scp are the same ones read further down, whatever the checkout path.
    result_dir = PATH_TO_HACONE + '/Spearmint-PESM/examples/cifar10/output'

    # json.dump emits text, so the file must be opened in text mode
    # (binary mode raises TypeError on Python 3).
    with open(file_to_send, 'w') as fp:
        json.dump(dico_to_save, fp)

    # Now we need to send this file (the job file) to the AWS instance.
    subprocess.call("scp -i {} {} ubuntu@{}:~/job_file.txt".format(private_key, file_to_send, public_DNS), shell=True)

    # Launch the train and eval script.
    command = [
        "export LD_LIBRARY_PATH=/usr/local/cuda-9.0/lib64:$LD_LIBRARY_PATH",
        "./train_eval_image_classifier_bis.py",
    ]
    print(command)
    command_id = ssm.run_command(instance_id, command)

    # Wait until the training command is no longer in progress.
    dic = {}
    while True:
        dic = ssm.notif_listener(dic)
        print('\n Global dic : ',dic,'\n')
        status = dic[command_id]
        if status != 'InProgress':
            break
    print(status)

    if status == 'Success':
        subprocess.call("scp -i {} -r ubuntu@{}:~/outputs/cifar10_nns/{}/ {}/".format(private_key, public_DNS, job_id, result_dir), shell=True)
        subprocess.call("scp -i {} ubuntu@{}:~/accuracy.txt {}/accuracy_{}.txt".format(private_key, public_DNS, job_id, result_dir), shell=True)
        output_dir = os.path.join('outputs', job_id)

        # Go to the output dir and copy everything to a bucket, then remove
        # everything from the instance.
        # NOTE(review): the relative path is used again after the `cd`; whether
        # that resolves depends on how ssm.run_command runs the list -- confirm.
        command_2 = [
            'cd {}'.format(output_dir),
            'aws s3 cp ./{} s3://astar-trainedmodels/{}/{}/awsrunShellScript/0.awsrunShellScript/ --recursive'.format(output_dir, command_id, instance_id),
            'rm -rf ./{}'.format(output_dir),
        ]
        command_id_2 = ssm.run_command(instance_id, command_2, save_bucket=False)

        # Wait for the upload/cleanup command to finish as well.
        while True:
            dic = ssm.notif_listener(dic)
            if dic[command_id_2] != 'InProgress':
                print(dic[command_id_2])
                dic.pop(command_id_2, None)
                break

    buckets.download_from_s3('astar-trainedmodels')
    dic.pop(command_id, None)
    with open(result_dir + '/accuracy_{}.txt'.format(job_id)) as my_file:
        dico = json.load(my_file)

    # Kept from the original flow: one more listener call after the wait
    # loop; its result is not used afterwards.
    ssm.notif_listener(dic)

    # Now it's time to find the hardware characteristics: time, memory, power.
    subprocess.call(PATH_TO_HACONE + "/monitoring_GPU/monitor_inference.py {}".format(job_id), shell=True)

    metrics_path = result_dir + '/hardware_metrics_{}.txt'.format(job_id)
    with open(metrics_path) as my_file:
        dico_hardware = json.load(my_file)

    # Accuracy entries overwrite hardware entries that share the same key.
    dico_hardware.update(dico)

    with open(metrics_path, 'w') as my_file:
        json.dump(dico_hardware, my_file)
    return dico_hardware