예제 #1
0
파일: cluster.py 프로젝트: haje01/bilbo
def resume_instance(inst_ids, ec2):
    """Resume (start) the given instances.

    A dry-run request is sent first as a permission check. The real start
    request is then repeated every 5 seconds for as long as it fails with
    an error whose text contains 'is not in a state'.

    Args:
        inst_ids: Instance IDs, passed as ``InstanceIds`` to
            ``ec2.start_instances``.
        ec2: Object exposing ``start_instances`` (presumably a boto3 EC2
            client -- confirm against callers).

    Raises:
        botocore.exceptions.ClientError: If the dry run fails for a reason
            other than 'DryRunOperation', or the real request fails for a
            reason other than 'is not in a state'.
    """
    warning("resume_instance: '{}'".format(inst_ids))

    # Permission check: a 'DryRunOperation' error is the accepted outcome.
    try:
        ec2.start_instances(InstanceIds=inst_ids, DryRun=True)
    except botocore.exceptions.ClientError as dry_err:
        dry_msg = str(dry_err)
        if 'DryRunOperation' not in dry_msg:
            error(dry_msg)
            raise

    # Resume: keep retrying while the error says the state is not ready.
    started = False
    while not started:
        try:
            response = ec2.start_instances(InstanceIds=inst_ids, DryRun=False)
            info(response)
            started = True
        except botocore.exceptions.ClientError as start_err:
            reason = str(start_err)
            if 'is not in a state' in reason:
                time.sleep(5)
            else:
                error(reason)
                raise
예제 #2
0
파일: cluster.py 프로젝트: haje01/bilbo
def _wait_until_running(inst):
    """Block until ``inst`` is running, reload it, and return it.

    Args:
        inst: Object exposing ``wait_until_running()`` and ``load()``
            (presumably a boto3 EC2 Instance resource -- confirm).

    Returns:
        The same ``inst`` object after ``load()`` has been called.

    Raises:
        Exception: Any exception from waiting or loading is logged and
            re-raised.
    """
    info(f"_wait_until_running: {inst}")
    try:
        inst.wait_until_running()
        inst.load()
    except Exception as exc:
        # Log the failure before handing it back to the caller.
        error(str(exc))
        raise exc
    else:
        return inst
예제 #3
0
파일: cluster.py 프로젝트: haje01/bilbo
def start_dask_cluster(clinfo):
    """Start the Dask scheduler and workers of a cluster.

    Launches ``dask-scheduler`` on the scheduler instance, reads the total
    memory of the first worker to derive worker options, launches
    ``dask-worker`` on every worker, then waits for the dashboard URL to
    accept connections.

    Args:
        clinfo (dict): Cluster info. This function reads
            ``clinfo['template']['scheduler' | 'worker']`` (``ssh_user``,
            ``ssh_private_key``), ``clinfo['instance']['scheduler']``,
            ``clinfo['instance']['workers']`` and the optional
            ``'private_command'`` key. It writes ``nproc``, ``nthread`` and
            ``memory`` into the worker template and sets
            ``clinfo['dask_dashboard_url']``.

    Raises:
        Exception: Whatever ``wait_until_connect`` raises is logged and
            re-raised.
    """
    critical("Start dask scheduler & workers.")
    private_command = clinfo.get('private_command')

    # Start the scheduler in a detached screen session.
    stpl = clinfo['template']['scheduler']
    user, private_key = stpl['ssh_user'], stpl['ssh_private_key']
    scd = clinfo['instance']['scheduler']
    sip = _get_ip(scd, private_command)
    scd_dns = scd['private_dns_name']
    cmd = "screen -S bilbo -d -m dask-scheduler"
    send_instance_cmd(user, private_key, sip, cmd)

    # Install AWS credentials on the scheduler.
    setup_aws_creds(user, private_key, sip)

    # Every SSH call from here on targets a worker, so switch to the worker
    # template's credentials BEFORE probing worker memory. The probe used to
    # run with the scheduler's user/key, which only works when both
    # templates happen to share them.
    wtpl = clinfo['template']['worker']
    user, private_key = wtpl['ssh_user'], wtpl['ssh_private_key']

    # Work out the worker launch options from the first worker's memory.
    wrks = clinfo['instance']['workers']
    wip = _get_ip(wrks[0], private_command)
    info("  Get worker memory from '{}'".format(wip))
    cmd = "free -b | grep 'Mem:' | awk '{print $2}'"
    stdouts, _ = send_instance_cmd(user, private_key, wip, cmd)
    memory = int(stdouts[0])
    nproc, nthread, memory = dask_worker_options(wtpl, memory)
    # Record the chosen options in the worker template.
    wtpl['nproc'] = nproc
    wtpl['nthread'] = nthread
    wtpl['memory'] = memory

    # The launch command is identical for all workers; build it once.
    opts = "--nprocs {} --nthreads {} --memory-limit {}".\
        format(nproc, nthread, memory)
    worker_cmd = "screen -S bilbo -d -m dask-worker {}:8786 {}".\
        format(scd_dns, opts)

    # For every worker
    for wrk in wrks:
        wip = _get_ip(wrk, private_command)
        # Install AWS credentials on the worker.
        setup_aws_creds(user, private_key, wip)

        # Start the worker.
        warning("  Worker options: {}".format(opts))
        send_instance_cmd(user, private_key, wip, worker_cmd)

    # Wait for the Dask scheduler's dashboard.
    dash_url = 'http://{}:8787'.format(sip)
    clinfo['dask_dashboard_url'] = dash_url
    critical("Wait for Dask dashboard ready.")
    try:
        wait_until_connect(dash_url)
    except Exception as e:
        error(str(e))
        raise
예제 #4
0
파일: cluster.py 프로젝트: haje01/bilbo
def open_notebook(clname, url_only=False):
    """Open the notebook of a cluster, or just print its URL.

    Args:
        clname (str): Cluster name, validated with ``check_cluster``.
        url_only (bool): When true, print the URL instead of opening it.

    Raises:
        Exception: If the cluster info has no ``'notebook_url'`` entry.
    """
    warning(f"open_notebook {clname}")
    check_cluster(clname)
    clinfo = load_cluster_info(clname)

    # Guard: nothing to open without a recorded notebook URL.
    if 'notebook_url' not in clinfo:
        error("no notebook instance.")
        raise Exception("No notebook instance.")

    url = clinfo['notebook_url']
    if url_only:
        print(url)
        return
    open_url(url, clinfo)
예제 #5
0
파일: cluster.py 프로젝트: haje01/bilbo
def create_ec2_instances(ec2, tpl, cnt, tag_spec):
    """Create EC2 instances from a template.

    Args:
        ec2: Object exposing ``create_instances`` (presumably a boto3 EC2
            resource -- confirm against callers).
        tpl (dict): Template; reads ``ami``, ``ec2type``, ``keyname`` and
            ``security_group``.
        cnt (int): Used for both ``MinCount`` and ``MaxCount``.
        tag_spec: Passed through as ``TagSpecifications``.

    Returns:
        The result of ``ec2.create_instances``, or ``None`` when a
        ``ClientError`` containing 'Request would have succeeded' occurs.

    Raises:
        botocore.exceptions.ClientError: Any other client error (logged
            first).
    """
    rdm = get_root_dm(ec2, tpl)

    request = {
        'ImageId': tpl['ami'],
        'InstanceType': tpl['ec2type'],
        'MinCount': cnt,
        'MaxCount': cnt,
        'KeyName': tpl['keyname'],
        'BlockDeviceMappings': rdm,
        'SecurityGroupIds': [tpl['security_group']],
        'TagSpecifications': tag_spec,
    }

    try:
        return ec2.create_instances(**request)
    except botocore.exceptions.ClientError as exc:
        detail = str(exc)
        error("create_ec2_instances - {}".format(detail))
        # This particular message is tolerated: log it and return nothing.
        if 'Request would have succeeded' in detail:
            return None
        raise exc
예제 #6
0
def check_profile(proname):
    """Validate a profile name and return the path of its file.

    Args:
        proname (str): Profile name, including the ``.json`` extension.

    Returns:
        str: ``proname`` joined onto ``prof_dir``.

    Raises:
        NameError: If ``proname`` does not end with ``.json``
            (case-insensitive).
        FileNotFoundError: If no file exists at the resulting path.
    """
    has_json_ext = proname.lower().endswith('.json')
    if not has_json_ext:
        template = "Wrong profile name '{}'. Use '{}.json' instead."
        raise NameError(template.format(proname, proname))

    # The file must exist.
    profile_path = os.path.join(prof_dir, proname)
    if os.path.isfile(profile_path):
        return profile_path

    error("Profile '{}' does not exist.".format(profile_path))
    raise FileNotFoundError(profile_path)
예제 #7
0
파일: cluster.py 프로젝트: haje01/bilbo
def pause_instance(inst_ids):
    """Pause (stop) the given instances.

    A dry-run request is sent first as a permission check, then the real
    stop request. A failure of the real request is only logged; it is not
    re-raised.

    Args:
        inst_ids: Instance IDs, passed as ``InstanceIds`` to
            ``stop_instances``.

    Raises:
        botocore.exceptions.ClientError: If the dry run fails for a reason
            other than 'DryRunOperation'.
    """
    warning("pause_instance: '{}'".format(inst_ids))
    client = boto3.client('ec2')

    # Permission check: a 'DryRunOperation' error is the accepted outcome.
    try:
        client.stop_instances(InstanceIds=inst_ids, DryRun=True)
    except botocore.exceptions.ClientError as dry_err:
        dry_msg = str(dry_err)
        if 'DryRunOperation' not in dry_msg:
            error(dry_msg)
            raise dry_err

    # Stop: errors here are logged and otherwise ignored.
    try:
        info(client.stop_instances(InstanceIds=inst_ids, DryRun=False))
    except botocore.exceptions.ClientError as stop_err:
        error(str(stop_err))
예제 #8
0
파일: cluster.py 프로젝트: haje01/bilbo
def check_cluster(clname):
    """Validate a cluster name and return the path of its info file.

    Args:
        clname (str): Cluster name, without the ``.json`` extension.

    Returns:
        str: ``clname + '.json'`` joined onto ``clust_dir``.

    Raises:
        NameError: If ``clname`` ends with ``.json`` (case-insensitive).
        FileNotFoundError: If no file exists at the resulting path.
    """
    info(f"check_cluster {clname}")
    if clname.lower().endswith('.json'):
        # Suggest the name with its final '.<ext>' component removed.
        rname = clname.rpartition('.')[0]
        template = "Wrong cluster name '{}'. Use '{}' instead."
        raise NameError(template.format(clname, rname))

    # The file must exist.
    cluster_path = os.path.join(clust_dir, clname + '.json')
    if os.path.isfile(cluster_path):
        return cluster_path

    error("Cluster '{}' does not exist.".format(cluster_path))
    raise FileNotFoundError(cluster_path)
예제 #9
0
파일: cluster.py 프로젝트: haje01/bilbo
def send_instance_cmd(ssh_user, ssh_private_key, ip, cmd,
                      show_stdout=False, show_stderr=True, retry_count=30,
                      get_excode=False):
    """Run a shell command on an instance over SSH.

    https://stackoverflow.com/questions/42645196/how-to-ssh-and-run-commands-in-ec2-using-boto3

    Args:
        ssh_user (str): SSH user.
        ssh_private_key (str): Path of the SSH private key (``~`` is
            expanded). Loaded as an RSA key.
        ip (str): IP of the target instance.
        cmd (str): Shell command line to execute.
        show_stdout (bool): Echo standard output as it arrives.
        show_stderr (bool): Log collected standard error via ``error``.
        retry_count (int): Number of connection attempts.
        get_excode (bool): Also report the command's exit code.
            Defaults to False.

    Returns:
        tuple or None: ``(stdout_lines, stderr)`` where ``stdout_lines`` is
        the standard output split on newlines and ``stderr`` is one string.
        With ``get_excode`` the tuple is ``(stdout_lines, stderr,
        exit_code)``, where ``exit_code`` is -1 if it could not be read.
        ``None`` is returned when every connection attempt failed.
    """
    # Local import: only this function needs chunk-safe decoding.
    import codecs

    info('send_instance_cmd - user: {}, key: {}, ip {}, cmd {}'
         .format(ssh_user, ssh_private_key, ip, cmd))

    key_path = expanduser(ssh_private_key)

    key = paramiko.RSAKey.from_private_key_file(key_path)
    client = paramiko.SSHClient()
    # NOTE(review): AutoAddPolicy accepts unknown host keys without
    # verification; confirm this is acceptable for the deployment.
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())

    stdouts = []
    stderrs = []
    done_file = '/tmp/bilbo_rcmd_done'

    # try/finally guarantees the client is closed on every path, including
    # a failed connection and unexpected exceptions.
    try:
        connected = False
        for _ in range(retry_count):
            try:
                client.connect(hostname=ip, username=ssh_user, pkey=key)
            except (paramiko.ssh_exception.NoValidConnectionsError,
                    TimeoutError, BlockingIOError):
                warning("Connection failed to '{}'. Retry after a while.".
                        format(ip))
                time.sleep(TRY_SLEEP)
            else:
                connected = True
                break

        if not connected:
            error("Connection failed to '{}'".format(ip))
            return None

        if get_excode:
            # Make the remote shell record the command's exit code in a file.
            cmd = f"rm -f {done_file} && " + cmd + f" ; echo $? > {done_file}"

        # Interactive mode
        transport = client.get_transport()
        transport.set_keepalive(60)
        channel = transport.open_session()
        channel.exec_command(cmd)

        # Incremental decoders keep a multi-byte UTF-8 character that is
        # split across two 4096-byte reads from raising; undecodable bytes
        # are replaced rather than aborting the whole command.
        make_decoder = codecs.getincrementaldecoder('utf-8')
        out_decoder = make_decoder(errors='replace')
        err_decoder = make_decoder(errors='replace')

        while True:
            time.sleep(0.1)
            # Sample the exit state BEFORE draining, so output that was
            # already buffered when the command finished is still read.
            finished = channel.exit_status_ready()

            while channel.recv_ready():
                text = out_decoder.decode(channel.recv(4096))
                stdouts.append(text)
                if show_stdout:
                    print(text, end="")

            while channel.recv_stderr_ready():
                stderrs.append(err_decoder.decode(channel.recv_stderr(4096)))

            if finished:
                break

        # Flush whatever the decoders are still holding back.
        tail = out_decoder.decode(b'', final=True)
        if tail:
            stdouts.append(tail)
            if show_stdout:
                print(tail, end="")
        stderrs.append(err_decoder.decode(b'', final=True))
    finally:
        client.close()

    stdouts = ''.join(stdouts).split('\n')
    stderr = ''.join(stderrs)

    if show_stderr and stderr:
        error(stderr)

    if not get_excode:
        return stdouts, stderr

    # Read the recorded exit code back over a second connection.
    ccmd = f'if [ -f {done_file} ]; then cat {done_file}; fi'
    result = send_instance_cmd(ssh_user, ssh_private_key, ip, ccmd)
    excode = -1
    if result is not None:  # None means the second connection failed
        try:
            excode = int(result[0][0])
        except ValueError:
            pass
    return stdouts, stderr, excode