Beispiel #1
0
def resume_instance(inst_ids, ec2):
    """Start (resume) the given instances.

    A dry run is issued first to check permissions. The real start request
    is then repeated every 5 seconds for as long as EC2 answers that an
    instance "is not in a state" from which it can be started.

    Args:
        inst_ids: instance ids passed as ``InstanceIds``
        ec2: EC2 client providing ``start_instances``

    Raises:
        botocore.exceptions.ClientError: the dry run failed for a reason
            other than 'DryRunOperation', or the start request failed for a
            reason other than the instance state.
    """
    warning("resume_instance: '{}'".format(inst_ids))

    # Permission check
    try:
        ec2.start_instances(InstanceIds=inst_ids, DryRun=True)
    except botocore.exceptions.ClientError as exc:
        if 'DryRunOperation' not in str(exc):
            error(str(exc))
            raise

    # Resume
    started = False
    while not started:
        try:
            result = ec2.start_instances(InstanceIds=inst_ids, DryRun=False)
            info(result)
            started = True
        except botocore.exceptions.ClientError as exc:
            text = str(exc)
            if 'is not in a state' in text:
                # Not startable yet; wait and try again.
                time.sleep(5)
                continue
            error(text)
            raise
Beispiel #2
0
def create_dask_cluster(ec2, clinfo):
    """Create the scheduler and worker instances of a Dask cluster.

    ``clinfo`` is updated in place: its ``'type'`` becomes ``'dask'``, the
    scheduler and worker instance info is stored under ``'instance'``, and
    the cluster info is saved. When at least one worker exists, the CPU info
    of the first worker is stored in the worker template.

    Args:
        ec2 (botocore.client.EC2): boto EC2 client
        clinfo (dict): cluster information
    """
    cluster_name = clinfo['name']
    inst_prefix = clinfo['profile'].get('instance_prefix')
    critical("Create dask cluster '{}'.".format(cluster_name))
    clinfo['type'] = 'dask'

    # Create the scheduler, then the workers
    template = clinfo['template']
    scheduler = create_inst(ec2, template, 'scheduler', cluster_name,
                            inst_prefix)[0]
    workers = create_inst(ec2, template, 'worker', cluster_name, inst_prefix)
    worker_infos = []
    clinfo['instance']['workers'] = worker_infos

    # Wait until the instances are usable, then collect their details.
    info("Wait for instance to be running.")
    scheduler = _wait_until_running(scheduler)
    clinfo['instance']['scheduler'] = instance_info(scheduler)
    for worker in workers:
        running = _wait_until_running(worker)
        worker_infos.append(instance_info(running))

    save_cluster_info(clinfo)

    if len(workers) == 0:
        return

    # The `ncpu_options` value returned at instance creation comes back
    # wrong, so the CPU info is requested here from the first worker.
    first_ip = _get_ip(worker_infos[0],
                       clinfo['profile'].get('private_command'))
    template['worker']['cpu_info'] = get_cpu_info(template['worker'],
                                                  first_ip)
Beispiel #3
0
def read_profile(profile, params=None):
    """Read a profile, validate it and apply optional overrides.

    Args:
        profile: profile name
        params: parameters that override values of the profile

    Returns:
        dict: profile information; ``'name'`` is set to the base name of the
            profile file

    Raises:
        RuntimeError: the profile no longer validates after ``params`` were
            applied

    """
    info("read_profile {}".format(profile))
    path = check_profile(profile)
    with codecs.open(path, 'rb', encoding='utf-8') as fp:
        loaded = json.load(fp)

    # Validate the profile as read from disk
    validate_by_schema(loaded)

    if params is not None:
        # Apply the overrides, then validate the result again
        override_cfg_by_params(loaded, params)
        try:
            validate_by_schema(loaded)
        except jsonschema.exceptions.ValidationError:
            lines = ["There is an incorrect parameter:"]
            lines.extend('  {}'.format(item) for item in params)
            raise RuntimeError('\n'.join(lines))

    loaded['name'] = os.path.basename(path)
    return loaded
Beispiel #4
0
def _wait_until_running(inst):
    """Wait until the instance is running, call its ``load()`` and return it.

    Any exception is logged through ``error`` and re-raised.
    """
    info(f"_wait_until_running: {inst}")
    try:
        inst.wait_until_running()
        inst.load()
    except Exception as exc:
        error(str(exc))
        raise exc
    else:
        return inst
Beispiel #5
0
def _get_run_python(path, params):
    """Build the shell command that runs a Python file inside NB_WORKDIR.

    Every (key, value) pair yielded by ``_iter_run_param(params)`` is placed
    in front of ``python`` as ``key=value``.

    Returns:
        str: the command line
    """
    assignments = ''.join(
        "{}={} ".format(name, val) for name, val in _iter_run_param(params)
    )
    command = 'cd {} && {}python {}'.format(NB_WORKDIR, assignments, path)
    info("_get_run_python : {}".format(command))
    return command
Beispiel #6
0
def start_dask_cluster(clinfo):
    """Start the Dask scheduler and workers of a cluster.

    The scheduler is started first. The memory size of the first worker is
    then read to decide the worker options, every worker is started with
    those options, and finally the function waits for the scheduler's
    dashboard to answer.

    Args:
        clinfo (dict): cluster information. Updated in place: the worker
            template receives ``'nproc'``, ``'nthread'`` and ``'memory'``,
            and ``'dask_dashboard_url'`` is set.

    Raises:
        Exception: whatever ``wait_until_connect`` raises; it is logged and
            re-raised.
    """
    critical("Start dask scheduler & workers.")
    private_command = clinfo.get('private_command')

    # Start the scheduler
    stpl = clinfo['template']['scheduler']
    suser, skey = stpl['ssh_user'], stpl['ssh_private_key']
    scd = clinfo['instance']['scheduler']
    sip = _get_ip(scd, private_command)
    scd_dns = scd['private_dns_name']
    cmd = "screen -S bilbo -d -m dask-scheduler"
    send_instance_cmd(suser, skey, sip, cmd)

    # Install AWS credentials
    setup_aws_creds(suser, skey, sip)

    # Decide the worker options. The memory query runs on a worker, so it
    # must use the worker template's SSH user/key (not the scheduler's),
    # like every other worker command below.
    wtpl = clinfo['template']['worker']
    wuser, wkey = wtpl['ssh_user'], wtpl['ssh_private_key']
    wrks = clinfo['instance']['workers']
    wip = _get_ip(wrks[0], private_command)
    info("  Get worker memory from '{}'".format(wip))
    cmd = "free -b | grep 'Mem:' | awk '{print $2}'"
    stdouts, _ = send_instance_cmd(wuser, wkey, wip, cmd)
    memory = int(stdouts[0])
    nproc, nthread, memory = dask_worker_options(wtpl, memory)
    # Record the chosen options
    wtpl['nproc'] = nproc
    wtpl['nthread'] = nthread
    wtpl['memory'] = memory

    # The start command is the same for every worker; build it once.
    opts = "--nprocs {} --nthreads {} --memory-limit {}".\
        format(nproc, nthread, memory)
    worker_cmd = "screen -S bilbo -d -m dask-worker {}:8786 {}".\
        format(scd_dns, opts)

    # For every worker
    for wrk in wrks:
        wip = _get_ip(wrk, private_command)
        # Install AWS credentials
        setup_aws_creds(wuser, wkey, wip)

        # Start the worker
        warning("  Worker options: {}".format(opts))
        send_instance_cmd(wuser, wkey, wip, worker_cmd)

    # Wait for the dashboard of the Dask scheduler
    dash_url = 'http://{}:8787'.format(sip)
    clinfo['dask_dashboard_url'] = dash_url
    critical("Wait for Dask dashboard ready.")
    try:
        wait_until_connect(dash_url)
    except Exception as e:
        error(str(e))
        raise
Beispiel #7
0
def open_url(url, cldata):
    """Open a URL with the browser named in ``cldata`` or the default one.

    When ``cldata`` has a ``'webbrowser'`` entry, its value is used as the
    path of the browser executable; otherwise the default browser is used.
    """
    info("open_url")
    if 'webbrowser' not in cldata:
        webbrowser.open(url)
        return

    browser_path = cldata['webbrowser']
    info("  Using explicit web browser: {}".format(browser_path))
    explicit = webbrowser.BackgroundBrowser(browser_path)
    webbrowser.register('explicit', None, explicit)
    webbrowser.get('explicit').open(url)
Beispiel #8
0
def wait_until_connect(url, retry_count=60):
    """Wait until the URL can be opened.

    Args:
        url: URL to open
        retry_count (int): maximum number of attempts; TRY_SLEEP seconds are
            slept after each failed attempt

    Raises:
        ConnectionError: no attempt succeeded
    """
    info("wait_until_connect: {}".format(url))
    for _ in range(retry_count):
        try:
            urlopen(url, timeout=8)
        except (HTTPError, URLError, timeout):
            info("Can not connect to dashboard. Wait for a while.")
            time.sleep(TRY_SLEEP)
            continue
        return

    raise ConnectionError()
Beispiel #9
0
def create_notebook(ec2, clinfo):
    """Create the notebook instance and record its info in ``clinfo``.

    Waits until the instance is running, stores its info under
    ``clinfo['instance']['notebook']`` and saves the cluster info.
    """
    critical("Create notebook.")
    cluster_name = clinfo['name']
    inst_prefix = clinfo['profile'].get('instance_prefix')
    template = clinfo['template']
    created = create_inst(ec2, template, 'notebook', cluster_name,
                          inst_prefix)

    info("Wait for notebook instance to be running.")
    notebook = _wait_until_running(created[0])
    clinfo['instance']['notebook'] = instance_info(notebook)
    save_cluster_info(clinfo)
Beispiel #10
0
def get_root_dm(ec2, iinfo):
    """Build the device mapping that gives the AMI's root device a size.

    Args:
        ec2: EC2 resource whose ``images.filter`` is used to look up the AMI
        iinfo (dict): instance info; ``'vol_size'`` is optional, ``'ami'``
            is read when a size is given

    Returns:
        list: an empty list when no ``'vol_size'`` is set, otherwise a
            one-element block device mapping for the root device

    Raises:
        ValueError: the AMI was not found
    """
    size = iinfo.get('vol_size')
    if size is None:
        return []

    ami = iinfo['ami']
    found = list(ec2.images.filter(ImageIds=[ami]))
    if not found:
        raise ValueError("AMI {} does not exist.".format(ami))

    mapping = [{
        "DeviceName": found[0].root_device_name,
        "Ebs": {"VolumeSize": size},
    }]
    info("get_root_dm: {}".format(mapping))
    return mapping
Beispiel #11
0
def get_cpu_info(tpl, ip):
    """Get CPU information from a created instance with the lscpu command.

    Args:
        tpl (dict): instance template providing ``'ssh_user'`` and
            ``'ssh_private_key'``
        ip: IP of the instance to query

    Returns:
        dict: ``{'CoreCount': int, 'ThreadsPerCore': int}``
    """
    info("get_cpu_info")
    user = tpl['ssh_user']
    private_key = tpl['ssh_private_key']
    # Cores. Raw string: `\(` is not a valid Python escape sequence, and the
    # backslashes must reach the remote shell unchanged.
    cmd = r"lscpu | grep -e ^CPU\(s\): | awk '{print $2}'"
    res, _ = send_instance_cmd(user, private_key, ip, cmd)
    num_core = int(res[0])
    # Threads per core
    cmd = "lscpu | grep Thread | awk '{print $4}'"
    res, _ = send_instance_cmd(user, private_key, ip, cmd)
    threads_per_core = int(res[0])
    cpu_info = {'CoreCount': num_core, 'ThreadsPerCore': threads_per_core}
    return cpu_info
Beispiel #12
0
def pause_instance(inst_ids):
    """Stop (pause) the given instances.

    A dry run is issued first to check permissions. A failure of the real
    stop request is only logged, not raised.

    Args:
        inst_ids: instance ids passed as ``InstanceIds``

    Raises:
        botocore.exceptions.ClientError: the dry run failed for a reason
            other than 'DryRunOperation'
    """
    warning("pause_instance: '{}'".format(inst_ids))
    client = boto3.client('ec2')

    # Permission check
    try:
        client.stop_instances(InstanceIds=inst_ids, DryRun=True)
    except botocore.exceptions.ClientError as exc:
        if 'DryRunOperation' not in str(exc):
            error(str(exc))
            raise exc

    # Stop
    try:
        result = client.stop_instances(InstanceIds=inst_ids, DryRun=False)
        info(result)
    except botocore.exceptions.ClientError as exc:
        error(str(exc))
Beispiel #13
0
def check_cluster(clname):
    """Check that a cluster file exists and return its path.

    Args:
        clname (str): cluster name (without the .json extension)

    Returns:
        str: path of the cluster's JSON file under ``clust_dir``

    Raises:
        NameError: ``clname`` ends with '.json'
        FileNotFoundError: the cluster file does not exist
    """
    info(f"check_cluster {clname}")
    if clname.lower().endswith('.json'):
        suggested = clname.rsplit('.', 1)[0]
        raise NameError(
            "Wrong cluster name '{}'. Use '{}' instead.".format(clname,
                                                                suggested))

    # File existence
    path = os.path.join(clust_dir, clname + '.json')
    if os.path.isfile(path):
        return path

    error("Cluster '{}' does not exist.".format(path))
    raise FileNotFoundError(path)
Beispiel #14
0
def _get_run_notebook(path, nb_params, cmd_params=None):
    assert type(nb_params) in [list, tuple]

    tname = next(tempfile._get_candidate_names())
    tmp = '/tmp/{}'.format(tname)
    elms = path.split('.')
    out_path = '.'.join(elms[:-1]) + '.out.' + elms[-1]
    cmd = "cd {} && ".format(NB_WORKDIR)

    if cmd_params is not None:
        assert type(cmd_params) in [list, tuple]
        for key, value in _iter_run_param(cmd_params):
            cmd += "{}={} ".format(key, value)

    cmd += "papermill --cwd {} --no-progress-bar --stdout-file " \
        "{} {} {}".format(NB_WORKDIR, tmp, path, out_path)

    for key, value in _iter_run_param(nb_params):
        cmd += " -p {} {}".format(key, value)

    info("_get_run_notebook : {}".format(cmd))
    return cmd, tmp
Beispiel #15
0
def stop_notebook_or_python(clname, path, params):
    """Stop a notebook / Python file that was started on the notebook instance.

    The command line that ``run_notebook_or_python`` would have launched is
    rebuilt, the matching process is looked up on the notebook instance with
    ``ps``/``grep``, and its child processes are killed with ``pkill -P``.

    Args:
        clname (str): cluster name
        path (str): path of the ``.ipynb`` or ``.py`` file that was run
        params: run parameters that were given when the file was started

    Raises:
        RuntimeError: the cluster has no notebook instance, the file type is
            not supported, or a Dask cluster has no scheduler address.
    """
    info("stop_notebook_or_python: {} - {}".format(clname, path))
    check_cluster(clname)
    clinfo = load_cluster_info(clname)
    dask = 'type' in clinfo and clinfo['type'] == 'dask'
    private_command = clinfo.get('private_command')

    # Instance info lives under 'instance' and SSH settings under
    # 'template', as in run_notebook_or_python.
    if 'notebook' not in clinfo['instance']:
        raise RuntimeError("No notebook instance.")

    ntpl = clinfo['template']['notebook']
    user, private_key = ntpl['ssh_user'], ntpl['ssh_private_key']
    ip = _get_ip(clinfo['instance']['notebook'], private_command)

    ext = path.split('.')[-1].lower()

    # The scheduler address is part of the launched command line only for
    # Dask clusters; resolve it the same way the launcher does.
    dask_scd_addr = None
    if dask:
        if 'scheduler' in clinfo['instance']:
            dask_scd_addr = _get_dask_scheduler_address(clinfo)
        else:
            for param in params:
                if param.startswith('DASK_SCHEDULER_ADDRESS'):
                    dask_scd_addr = param
        if dask_scd_addr is None:
            raise RuntimeError("No Dask scheduler address available.")

    # Notebook file
    if ext == 'ipynb':
        cparams = [dask_scd_addr] if dask else []
        # Run by papermill
        _cmd, tmp = _get_run_notebook(path, params, cparams)
        # The stdout file name is random on every call, so it can never
        # equal the one used at launch; let grep match any name there.
        _cmd = _cmd.replace(tmp, '.*')
        # '[p]apermill' keeps grep from matching its own command line.
        _cmd = _cmd.replace('papermill', '[p]apermill')
    # Python file
    elif ext == 'py':
        params = list(params)
        if dask:
            params.insert(0, dask_scd_addr)
        _cmd = _get_run_python(path, params)
        _cmd = _cmd.replace('python', '[p]ython')
    else:
        raise RuntimeError("Unsupported file type: {}".format(path))

    # Find the launched process
    cmd = "ps auxww | grep '{}' | awk '{{print $2}}' | head -n 1".format(_cmd)
    res, _ = send_instance_cmd(user, private_key, ip, cmd, show_stdout=False,
                               show_stderr=False)

    # Empty output still arrives as [''], so test the pid text itself
    # rather than the length of the list.
    proc = res[0].strip() if res else ''
    if proc:
        cmd = "pkill -P {}".format(proc)
        info("Delete process: {}".format(cmd))
        res, _ = send_instance_cmd(user, private_key, ip, cmd,
                                   show_stderr=False)
    else:
        info("No process exists.")
Beispiel #16
0
def run_notebook_or_python(clname, path, params):
    """Run a notebook or Python file on the remote notebook instance.

    Notebooks (``.ipynb``) are run through papermill; when stderr reports no
    error, the content of the papermill stdout file is fetched and returned
    as the output. Python files (``.py``) are run with ``python``. For Dask
    clusters the scheduler address is passed to the command.

    Args:
        clname (str): cluster name
        path (str): path of the ``.ipynb`` or ``.py`` file to run
        params: run parameters

    Returns:
        tuple: (stdout, exit_code)

    Raises:
        RuntimeError: the cluster has no notebook instance or the file type
            is not supported.
        AssertionError: a Dask cluster has no scheduler address available.

    """
    info("run_notebook_or_python: {} - {}".format(clname, path))

    check_cluster(clname)
    clinfo = load_cluster_info(clname)
    dask = clinfo.get('type') == 'dask'
    private_command = clinfo.get('private_command')

    if 'notebook' not in clinfo['instance']:
        raise RuntimeError("No notebook instance.")

    ntpl = clinfo['template']['notebook']
    user, private_key = ntpl['ssh_user'], ntpl['ssh_private_key']
    nb = clinfo['instance']['notebook']
    nip = _get_ip(nb, private_command)

    ext = path.split('.')[-1].lower()

    dask_scd_addr = None
    if dask:
        if 'scheduler' in clinfo['instance']:
            dask_scd_addr = _get_dask_scheduler_address(clinfo)
        else:
            # No scheduler instance: take the address from the parameters.
            for param in params:
                if param.startswith('DASK_SCHEDULER_ADDRESS'):
                    dask_scd_addr = param
        assert dask_scd_addr is not None, "No Dask scheduler address available."

    def _check_err(err):
        """Print stderr and return True when it mentions an error."""
        # stderr may arrive as one string (send_instance_cmd) or as a
        # sequence of lines. Joining a plain string would put a newline
        # between every character and hide the word 'error'.
        if not isinstance(err, str):
            err = '\n'.join(err)
        if 'error' in err.lower():
            print(err)
            return True
        return False

    # Notebook file
    if ext == 'ipynb':
        cparams = []
        if dask:
            cparams.insert(0, dask_scd_addr)
        # Run by papermill
        cmd, tmp = _get_run_notebook(path, params, cparams)
        res, err, excode = run_cmd_and_store_result(clname, user, private_key, nip, cmd,
                                                    show_stdout=True, show_stderr=False)
        if not _check_err(err):
            cmd = 'cat {}'.format(tmp)
            res, err = send_instance_cmd(user, private_key, nip, cmd, show_stdout=True,
                                         show_stderr=False)
            # err is a single string here; look for the message in the
            # whole text, not in its first character.
            if len(err) > 0 and 'No such file' not in err:
                _check_err(err)
    # Python file
    elif ext == 'py':
        params = list(params)
        if dask:
            params.insert(0, dask_scd_addr)
        cmd = _get_run_python(path, params)
        res, err, excode = run_cmd_and_store_result(clname, user, private_key, nip, cmd,
                                                    show_stdout=True)
        _check_err(err)
    else:
        raise RuntimeError("Unsupported file type: {}".format(path))

    return res, excode
Beispiel #17
0
def start_notebook(clinfo, retry_count=60):
    """Start Jupyter Lab on the notebook instance and record its URL.

    Args:
        clinfo (dict): cluster creation info. ``'notebook_url'`` is written
            into it, and ``'git_cloned_dir'`` when the profile has git
            settings for the notebook.
        retry_count (int): number of attempts to fetch the access URL.
            Default 60

    Raises:
        TimeoutError: the URL could not be fetched within ``retry_count``
            attempts
        NotImplementedError: the cluster has a type other than 'dask'

    """
    critical("Start notebook.")

    template = clinfo['template']['notebook']
    profile = clinfo['profile']
    user = template['ssh_user']
    private_key = template['ssh_private_key']
    nb_inst = clinfo['instance']['notebook']
    ip = _get_ip(nb_inst, profile.get('private_command'))

    # Install AWS credentials
    setup_aws_creds(user, private_key, ip)

    # Working directory
    workdir = template.get('workdir', NB_WORKDIR)
    send_instance_cmd(user, private_key, ip, "mkdir -p {}".format(workdir))

    # Set up git when the profile configures it
    if 'notebook' in profile and 'git' in profile['notebook']:
        git_cfg = profile['notebook']['git']
        clinfo['git_cloned_dir'] = setup_git(git_cfg, user, private_key, ip,
                                             workdir)

    # Notebook settings that depend on the cluster type
    env_vars = ''
    if 'type' in clinfo:
        if clinfo['type'] != 'dask':
            raise NotImplementedError()
        # Dashboard URL for dask-labextension
        sched_ip = clinfo['instance']['scheduler']['public_ip']
        settings_cmd = (
            "mkdir -p ~/.jupyter/lab/user-settings/dask-labextension; "
            'echo \'{{ "defaultURL": "http://{}:8787" }}\' > '
            '~/.jupyter/lab/user-settings/dask-labextension/'
            'plugin.jupyterlab-settings'
        ).format(sched_ip)
        send_instance_cmd(user, private_key, ip, settings_cmd)
        # Scheduler address
        env_vars = _get_dask_scheduler_address(clinfo)

    # Start Jupyter
    jupyter_cmd = "cd {} && {} jupyter lab --ip 0.0.0.0".format(workdir,
                                                                 env_vars)
    screen_cmd = "screen -S bilbo -d -m bash -c '{}'".format(jupyter_cmd)
    send_instance_cmd(user, private_key, ip, screen_cmd)

    # Fetch the access URL
    list_cmd = "jupyter notebook list | awk '{print $1}'"
    for _attempt in range(retry_count):
        lines, _err = send_instance_cmd(user, private_key, ip, list_cmd)
        # Record the URL once it is available
        if len(lines) > 1 and len(lines[1]) > 0:
            clinfo['notebook_url'] = lines[1].strip().replace(
                '0.0.0.0', nb_inst['public_ip'])
            return
        info("Can not fetch notebook list. Wait for a while.")
        time.sleep(TRY_SLEEP)
    raise TimeoutError("Can not get notebook url.")
Beispiel #18
0
def send_instance_cmd(ssh_user, ssh_private_key, ip, cmd,
                      show_stdout=False, show_stderr=True, retry_count=30,
                      get_excode=False):
    """Run a shell command on an instance over SSH.

    https://stackoverflow.com/questions/42645196/how-to-ssh-and-run-commands-in-ec2-using-boto3

    Args:
        ssh_user (str): SSH user
        ssh_private_key (str): path of the SSH private key
        ip (str): IP of the target instance
        cmd (str): shell command line to run
        show_stdout (bool): print standard output as it arrives
        show_stderr (bool): log standard error through ``error`` when the
            command has finished
        retry_count (int): number of connection attempts
        get_excode (bool): also report the exit code. Default False

    Returns:
        tuple: ``(stdout_lines, stderr)``, or
            ``(stdout_lines, stderr, exit_code)`` when ``get_excode`` is set.
            ``stdout_lines`` is the standard output split on newlines and
            ``stderr`` is a single string. When the connection cannot be
            established, ``stdout_lines`` is empty, ``stderr`` is ``''`` and
            ``exit_code`` is -1. ``exit_code`` is also -1 when it cannot be
            read back.
    """
    # Local import so this function does not rely on a module-level import.
    import codecs

    info('send_instance_cmd - user: {}, key: {}, ip {}, cmd {}'
         .format(ssh_user, ssh_private_key, ip, cmd))

    key_path = expanduser(ssh_private_key)

    key = paramiko.RSAKey.from_private_key_file(key_path)
    client = paramiko.SSHClient()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())

    done_file = '/tmp/bilbo_rcmd_done'

    # try/finally: the client is closed on every path, including errors.
    try:
        connected = False
        for _ in range(retry_count):
            try:
                client.connect(hostname=ip, username=ssh_user, pkey=key)
            except (paramiko.ssh_exception.NoValidConnectionsError,
                    TimeoutError, BlockingIOError):
                warning("Connection failed to '{}'. Retry after a while.".
                        format(ip))
                time.sleep(TRY_SLEEP)
            else:
                connected = True
                break

        if not connected:
            error("Connection failed to '{}'".format(ip))
            # Keep the documented tuple shape so callers can still unpack.
            return ([], '', -1) if get_excode else ([], '')

        if get_excode:
            # Embed writing of the exit code into a file
            cmd = f"rm -f {done_file} && " + cmd + f" ; echo $? > {done_file}"

        # Incremental decoders cope with a multi-byte character that is
        # split across two 4096-byte reads.
        out_decoder = codecs.getincrementaldecoder('utf-8')(errors='replace')
        err_decoder = codecs.getincrementaldecoder('utf-8')(errors='replace')
        stdouts = []
        stderrs = []

        # Interactive mode
        transport = client.get_transport()
        transport.set_keepalive(60)
        channel = transport.open_session()
        channel.exec_command(cmd)

        def _read_available():
            """Read one chunk from each ready stream; True if any was read."""
            got = False
            if channel.recv_ready():
                text = out_decoder.decode(channel.recv(4096))
                stdouts.append(text)
                if show_stdout:
                    print(text, end="")
                got = True
            if channel.recv_stderr_ready():
                stderrs.append(err_decoder.decode(channel.recv_stderr(4096)))
                got = True
            return got

        while True:
            time.sleep(0.1)
            _read_available()
            if channel.exit_status_ready():
                break

        # The command has finished, but output may still be buffered;
        # read it all so nothing is lost.
        while _read_available():
            pass

        # Flush bytes of an incomplete trailing character, if any.
        tail = out_decoder.decode(b'', final=True)
        if tail:
            stdouts.append(tail)
            if show_stdout:
                print(tail, end="")
        stderrs.append(err_decoder.decode(b'', final=True))

        stdouts = ''.join(stdouts).split('\n')
        stderr = ''.join(stderrs)

        if show_stderr and len(stderr) > 0:
            error(stderr)
    finally:
        client.close()

    if get_excode:
        ccmd = f'if [ -f {done_file} ]; then cat {done_file}; fi'
        out, _ = send_instance_cmd(ssh_user, ssh_private_key, ip, ccmd)
        try:
            excode = int(out[0])
        except (ValueError, IndexError):
            # No readable exit code (missing file or failed connection).
            excode = -1
        return stdouts, stderr, excode
    return stdouts, stderr