Esempio n. 1
0
def show_cluster(clname, detail=False):
    """Print information about a saved cluster.

    Args:
        clname (str): Name of the cluster to show.
        detail (bool): When true, pretty-print the whole stored cluster
            record and return without printing the summary view.

    Raises:
        NotImplementedError: If the record has a 'type' other than 'dask'.
    """
    warning(f"show_cluster {clname}")
    check_cluster(clname)
    cluster = load_cluster_info(clname)

    # Detail mode dumps the raw record and stops here.
    if detail:
        pprint(cluster)
        return

    print()
    print("Cluster Name: {}".format(cluster['name']))
    print("Ready Time: {}".format(cluster['saved_time']))

    instances = cluster['instance']
    # Running display index; show_instance returns the next index to use.
    next_idx = 1
    if 'notebook' in instances:
        print()
        print("Notebook:")
        next_idx = show_instance(next_idx, instances['notebook'])
        print()

    if 'type' in cluster:
        cluster_type = cluster['type']
        print("Cluster Type: {}".format(cluster_type))
        if cluster_type != 'dask':
            raise NotImplementedError()
        show_dask_cluster(next_idx, cluster)
    print()
Esempio n. 2
0
def git_clone_cmd(repo, user, passwd, workdir):
    """Build the shell command that clones a Git repository with credentials.

    The user name and password are percent-encoded before being embedded in
    the URL, so characters such as '@', ':' or '/' (e.g. an e-mail address
    used as the user name) cannot corrupt the URL or the shell command.

    Args:
        repo (str): Repository URL in the form '<protocol>://<address>'.
        user (str): Git user name (raw, not percent-encoded).
        passwd (str): Git password or token (raw, not percent-encoded).
        workdir (str): Directory to change into before cloning. It is
            inserted as-is so that shell expansions such as '~' keep working.

    Returns:
        str: A 'cd <workdir> && git clone <url>' command line.

    Raises:
        ValueError: If `repo` does not contain '://'.
    """
    # Local imports keep this block self-contained.
    from shlex import quote as shell_quote
    from urllib.parse import quote as url_quote

    warning("git clone: {}".format(repo))
    # Split on the first '://' only; later occurrences belong to the address.
    protocol, sep, address = repo.partition('://')
    if not sep:
        raise ValueError(
            "Repository URL must look like '<protocol>://<address>': {}"
            .format(repo))
    url = "{}://{}:{}@{}".format(protocol,
                                 url_quote(user, safe=''),
                                 url_quote(passwd, safe=''),
                                 address)
    # shell_quote leaves ordinary URLs untouched and only adds quoting when
    # the address contains characters that are special to the shell.
    cmd = "cd {} && git clone {}".format(workdir, shell_quote(url))
    return cmd
Esempio n. 3
0
def destroy_cluster(clname, force):
    """Terminate a cluster's EC2 instances and delete its info file.

    Args:
        clname (str): Name of the cluster to destroy.
        force (bool): When false and the cluster record has a
            'git_cloned_dir' entry, `check_git_modified` is consulted first
            and a falsy answer cancels the operation.
    """
    warning(f"destroy_cluster {clname}")
    check_cluster(clname)
    cluster = load_cluster_info(clname)

    needs_git_check = not force and 'git_cloned_dir' in cluster
    if needs_git_check and not check_git_modified(cluster):
        print("Canceled.")
        return

    critical("Destroy cluster '{}'.".format(clname))

    # Collect the ids of every instance: 'workers' holds a list of instance
    # records, every other role holds a single record.
    ec2 = boto3.client('ec2')
    targets = []
    for role, entry in cluster['instance'].items():
        members = entry if role == 'workers' else [entry]
        targets.extend(member['instance_id'] for member in members)
    if targets:
        ec2.terminate_instances(InstanceIds=targets)

    # Remove the cluster info file.
    os.unlink(os.path.join(clust_dir, clname + '.json'))
Esempio n. 4
0
def resume_instance(inst_ids, ec2):
    """Start the given EC2 instances, retrying until they can be started.

    Args:
        inst_ids (list): Instance ids passed as `InstanceIds`.
        ec2: Client object exposing `start_instances`.

    Raises:
        botocore.exceptions.ClientError: If the dry run fails for a reason
            other than 'DryRunOperation', or if the real start fails with a
            message that does not contain 'is not in a state'.
    """
    warning("resume_instance: '{}'".format(inst_ids))

    # Permission check: a dry run that reports 'DryRunOperation' is accepted.
    try:
        ec2.start_instances(InstanceIds=inst_ids, DryRun=True)
    except botocore.exceptions.ClientError as dry_err:
        dry_msg = str(dry_err)
        if 'DryRunOperation' not in dry_msg:
            error(dry_msg)
            raise

    # Real start: keep retrying every 5 seconds while the error message
    # says the instances are not yet in a startable state.
    started = False
    while not started:
        try:
            response = ec2.start_instances(InstanceIds=inst_ids, DryRun=False)
            info(response)
            started = True
        except botocore.exceptions.ClientError as start_err:
            start_msg = str(start_err)
            if 'is not in a state' not in start_msg:
                error(start_msg)
                raise
            time.sleep(5)
Esempio n. 5
0
def init_instances(clinfo):
    """Run the per-role initialization commands on the cluster instances.

    For each of the roles 'notebook', 'scheduler' and 'worker' (in that
    order) that appears in the template, every command in the role's
    'init_cmd' list is sent to the role's instance(s) over SSH.

    Args:
        clinfo (dict): Cluster record with 'template' and 'instance' entries.
    """
    warning("init_instances")
    templates = clinfo['template']
    instances = clinfo['instance']

    for role in ('notebook', 'scheduler', 'worker'):
        if role not in templates:
            continue
        role_tpl = templates[role]
        # The SSH settings are read even when there is no 'init_cmd'.
        user, private_key = role_tpl['ssh_user'], role_tpl['ssh_private_key']
        if 'init_cmd' not in role_tpl:
            continue
        cmds = role_tpl['init_cmd']

        # The 'worker' template maps to the list stored under 'workers';
        # other roles map to a single instance record of the same name.
        if role == 'worker':
            targets = instances['workers']
        else:
            targets = [instances[role]]

        for inst in targets:
            ip = inst['public_ip']
            for cmd in cmds:
                send_instance_cmd(user, private_key, ip, cmd)
Esempio n. 6
0
def load_cluster_info(clname):
    """Read and parse the JSON info file of a cluster.

    Args:
        clname (str): Cluster name; the file read is '<clname>.json' inside
            `clust_dir`.

    Returns:
        The parsed JSON content of the file.
    """
    warning("load_cluster_info: '{}'".format(clname))
    info_path = os.path.join(clust_dir, clname + '.json')
    with open(info_path, 'rt') as info_file:
        return json.load(info_file)
Esempio n. 7
0
def start_dask_cluster(clinfo):
    """Start the Dask scheduler and workers of a cluster.

    Launches `dask-scheduler` on the scheduler instance and `dask-worker` on
    every worker instance (each inside a detached `screen` session), installs
    AWS credentials on all of them, then waits for the Dask dashboard.

    The worker options chosen by `dask_worker_options` are written back into
    `clinfo['template']['worker']` ('nproc', 'nthread', 'memory'), and the
    dashboard URL is stored in `clinfo['dask_dashboard_url']`.

    Args:
        clinfo (dict): Cluster record with 'template' and 'instance' entries
            for 'scheduler' and 'worker'/'workers'.

    Raises:
        Exception: Whatever `wait_until_connect` raises is logged and
            re-raised.
    """
    critical("Start dask scheduler & workers.")
    private_command = clinfo.get('private_command')

    # Start the scheduler.
    stpl = clinfo['template']['scheduler']
    suser, skey = stpl['ssh_user'], stpl['ssh_private_key']
    scd = clinfo['instance']['scheduler']
    sip = _get_ip(scd, private_command)
    scd_dns = scd['private_dns_name']
    cmd = "screen -S bilbo -d -m dask-scheduler"
    send_instance_cmd(suser, skey, sip, cmd)

    # Install AWS credentials on the scheduler.
    setup_aws_creds(suser, skey, sip)

    # Workers are reached with the worker template's SSH settings. The
    # memory probe below previously reused the scheduler's user/key, which
    # fails when the two roles are configured with different credentials.
    wtpl = clinfo['template']['worker']
    wuser, wkey = wtpl['ssh_user'], wtpl['ssh_private_key']

    # Work out the worker launch options from the first worker's memory.
    wrks = clinfo['instance']['workers']
    wip = _get_ip(wrks[0], private_command)
    info("  Get worker memory from '{}'".format(wip))
    cmd = "free -b | grep 'Mem:' | awk '{print $2}'"
    stdouts, _ = send_instance_cmd(wuser, wkey, wip, cmd)
    memory = int(stdouts[0])
    nproc, nthread, memory = dask_worker_options(wtpl, memory)
    # Record the options that were decided.
    wtpl['nproc'] = nproc
    wtpl['nthread'] = nthread
    wtpl['memory'] = memory

    # For every worker.
    for wrk in wrks:
        wip = _get_ip(wrk, private_command)
        # Install AWS credentials.
        setup_aws_creds(wuser, wkey, wip)

        # Start the worker, pointing it at the scheduler on port 8786.
        opts = "--nprocs {} --nthreads {} --memory-limit {}".\
            format(nproc, nthread, memory)
        cmd = "screen -S bilbo -d -m dask-worker {}:8786 {}".\
            format(scd_dns, opts)
        warning("  Worker options: {}".format(opts))
        send_instance_cmd(wuser, wkey, wip, cmd)

    # Wait for the Dask scheduler's dashboard (port 8787).
    dash_url = 'http://{}:8787'.format(sip)
    clinfo['dask_dashboard_url'] = dash_url
    critical("Wait for Dask dashboard ready.")
    try:
        wait_until_connect(dash_url)
    except Exception as e:
        error(str(e))
        raise
Esempio n. 8
0
def open_notebook(clname, url_only=False):
    """Open (or just print) the notebook URL of a cluster.

    Args:
        clname (str): Cluster name.
        url_only (bool): When true, print the URL instead of opening it.

    Raises:
        Exception: If the cluster record has no 'notebook_url' entry.
    """
    warning(f"open_notebook {clname}")
    check_cluster(clname)
    cluster = load_cluster_info(clname)

    # Guard: nothing to open without a stored notebook URL.
    if 'notebook_url' not in cluster:
        error("no notebook instance.")
        raise Exception("No notebook instance.")

    notebook_url = cluster['notebook_url']
    if url_only:
        print(notebook_url)
        return
    open_url(notebook_url, cluster)
Esempio n. 9
0
def save_cluster_info(clinfo):
    """Write the cluster record to its JSON info file.

    The record is stamped with the current time under 'saved_time' (this
    mutates `clinfo`) and written to '<name>.json' inside `clust_dir`.

    Args:
        clinfo (dict): Cluster record; must contain 'name'.

    Raises:
        TypeError: If the record contains a value that cannot be serialized.
            The existing file is left untouched in that case.
    """
    clname = clinfo['name']

    def json_default(value):
        # Dates/datetimes are stored as 'YYYY-MM-DD HH:MM:SS' strings.
        if isinstance(value, datetime.date):
            return value.strftime('%Y-%m-%d %H:%M:%S')
        raise TypeError('not JSON serializable')

    warning("save_cluster_info: '{}'".format(clname))
    clinfo['saved_time'] = str(datetime.datetime.now())

    # Serialize BEFORE opening the file: opening with 'wt' truncates it, so a
    # serialization error after the open would wipe the previous record.
    body = json.dumps(clinfo, default=json_default, indent=4,
                      sort_keys=True, ensure_ascii=False)

    path = os.path.join(clust_dir, clname + '.json')
    with open(path, 'wt') as f:
        f.write(body)
Esempio n. 10
0
def pause_instance(inst_ids):
    """Stop the given EC2 instances.

    Args:
        inst_ids (list): Instance ids passed as `InstanceIds`.

    Raises:
        botocore.exceptions.ClientError: If the dry run fails for a reason
            other than 'DryRunOperation'. A failure of the real stop request
            is only logged, not raised.
    """
    warning("pause_instance: '{}'".format(inst_ids))
    client = boto3.client('ec2')

    # Permission check: a dry run that reports 'DryRunOperation' is accepted.
    try:
        client.stop_instances(InstanceIds=inst_ids, DryRun=True)
    except botocore.exceptions.ClientError as dry_err:
        dry_msg = str(dry_err)
        if 'DryRunOperation' not in dry_msg:
            error(dry_msg)
            raise dry_err

    # Real stop request; errors here are logged and swallowed.
    try:
        result = client.stop_instances(InstanceIds=inst_ids, DryRun=False)
    except botocore.exceptions.ClientError as stop_err:
        error(str(stop_err))
    else:
        info(result)
Esempio n. 11
0
def _update_cluster_info(ec2, clname, inst_ids, clinfo):
    """Wait for the instances' public IPs and store them in the cluster record.

    Polls `describe_instances` every 5 seconds until every described
    instance has a 'PublicIpAddress', copies the addresses into the matching
    notebook/scheduler/worker records of `clinfo`, and saves the record.

    All instances of all reservations are examined: several instances can
    share one reservation, so looking only at the first reservation or the
    first instance of each one would miss some of them.

    Args:
        ec2: Client object exposing `describe_instances`.
        clname (str): Cluster name (not used by this function).
        inst_ids (list): Instance ids passed as `InstanceIds`.
        clinfo (dict): Cluster record; its 'instance' entries are updated
            in place.

    Returns:
        dict: The same `clinfo` object, after saving.
    """
    # Wait until the information is refreshed.
    while True:
        warning("Wait until available.")
        time.sleep(5)
        res = ec2.describe_instances(InstanceIds=inst_ids)
        described = [inst
                     for reserv in res['Reservations']
                     for inst in reserv['Instances']]
        pending = [inst for inst in described
                   if 'PublicIpAddress' not in inst]
        if not pending:
            break
        # Show the first instance that has no public IP yet.
        print(pending[0])

    # Apply the changed addresses.
    new_ips = {inst['InstanceId']: inst['PublicIpAddress']
               for inst in described}

    insts = clinfo['instance']
    records = []
    if 'notebook' in insts:
        records.append(insts['notebook'])
    if 'scheduler' in insts:
        records.append(insts['scheduler'])
    if 'workers' in insts:
        records.extend(insts['workers'])

    for record in records:
        new_ip = new_ips.get(record['instance_id'])
        if new_ip is not None:
            record['public_ip'] = new_ip

    save_cluster_info(clinfo)
    return clinfo
Esempio n. 12
0
def send_instance_cmd(ssh_user, ssh_private_key, ip, cmd,
                      show_stdout=False, show_stderr=True, retry_count=30,
                      get_excode=False):
    """Run a shell command on an instance over SSH.

    https://stackoverflow.com/questions/42645196/how-to-ssh-and-run-commands-in-ec2-using-boto3

    Args:
        ssh_user (str): SSH user name.
        ssh_private_key (str): Path to the SSH private key ('~' is expanded).
        ip (str): IP address of the target instance.
        cmd (str): Command line to execute.
        show_stdout (bool): Echo the command's standard output as it arrives.
        show_stderr (bool): Log the command's standard error when non-empty.
        retry_count (int): Number of connection attempts.
        get_excode (bool): Also report the command's exit code. Defaults to
            False.

    Returns:
        tuple or None: `(stdout_lines, stderr_text)` where `stdout_lines` is
        the standard output split on newlines, or
        `(stdout_lines, stderr_text, exit_code)` when `get_excode` is true
        (`exit_code` is -1 if it could not be read back). Returns None when
        the connection could not be established within `retry_count`
        attempts.
    """
    # Local import keeps this block self-contained.
    import codecs

    info('send_instance_cmd - user: {}, key: {}, ip {}, cmd {}'
         .format(ssh_user, ssh_private_key, ip, cmd))

    key_path = expanduser(ssh_private_key)

    key = paramiko.RSAKey.from_private_key_file(key_path)
    client = paramiko.SSHClient()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())

    connected = False
    for _ in range(retry_count):
        try:
            client.connect(hostname=ip, username=ssh_user, pkey=key)
        except (paramiko.ssh_exception.NoValidConnectionsError,
                TimeoutError, BlockingIOError):
            warning("Connection failed to '{}'. Retry after a while.".
                    format(ip))
            time.sleep(TRY_SLEEP)
        else:
            connected = True
            break

    if not connected:
        error("Connection failed to '{}'".format(ip))
        client.close()
        return

    done_file = '/tmp/bilbo_rcmd_done'
    if get_excode:
        # Make the remote shell record the command's exit code in a file
        # that is read back by a second call below.
        cmd = f"rm -f {done_file} && " + cmd + f" ; echo $? > {done_file}"

    stdouts = []
    stderrs = []
    # Incremental decoders: a multi-byte UTF-8 character may be split across
    # two recv() chunks, which would break a per-chunk bytes.decode().
    out_decoder = codecs.getincrementaldecoder('utf-8')()
    err_decoder = codecs.getincrementaldecoder('utf-8')()

    try:
        transport = client.get_transport()
        transport.set_keepalive(60)
        channel = transport.open_session()
        channel.exec_command(cmd)
        while True:
            time.sleep(0.1)
            # Sample the exit flag first, then drain everything that is
            # buffered, so output still pending when the command exits is
            # not dropped.
            finished = channel.exit_status_ready()

            while channel.recv_ready():
                text = out_decoder.decode(channel.recv(4096))
                stdouts.append(text)
                if show_stdout:
                    print(text, end="")

            while channel.recv_stderr_ready():
                stderrs.append(err_decoder.decode(channel.recv_stderr(4096)))

            if finished:
                break

        # Flush the decoders (raises on a truncated trailing sequence).
        stdouts.append(out_decoder.decode(b'', final=True))
        stderrs.append(err_decoder.decode(b'', final=True))
    finally:
        # Always release the connection, even if the command setup failed.
        client.close()

    stdouts = ''.join(stdouts).split('\n')
    stderr = ''.join(stderrs)

    if show_stderr and len(stderr) > 0:
        error(stderr)

    if get_excode:
        ccmd = f'if [ -f {done_file} ]; then cat {done_file}; fi'
        result = send_instance_cmd(ssh_user, ssh_private_key, ip, ccmd)
        try:
            # `result` is None when the second connection failed.
            excode = int(result[0][0])
        except (TypeError, ValueError):
            excode = -1
        return stdouts, stderr, excode
    else:
        return stdouts, stderr