def resume_instance(inst_ids, ec2):
    """Resume (start) stopped instances."""
    warning("resume_instance: '{}'".format(inst_ids))

    # Check permission with a dry run.
    try:
        ec2.start_instances(InstanceIds=inst_ids, DryRun=True)
    except botocore.exceptions.ClientError as e:
        if 'DryRunOperation' not in str(e):
            error(str(e))
            raise

    # Start the instances, retrying while they are in a transitional state.
    while True:
        try:
            response = ec2.start_instances(InstanceIds=inst_ids, DryRun=False)
            info(response)
        except botocore.exceptions.ClientError as e:
            msg = str(e)
            if 'is not in a state' not in msg:
                error(msg)
                raise
            time.sleep(5)
        else:
            break

def create_dask_cluster(ec2, clinfo):
    """Create a Dask cluster.

    Args:
        ec2 (botocore.client.EC2): boto EC2 client
        clinfo (dict): cluster information
    """
    clname = clinfo['name']
    prefix = clinfo['profile'].get('instance_prefix')
    critical("Create dask cluster '{}'.".format(clname))
    clinfo['type'] = 'dask'

    # Create scheduler / workers.
    tpl = clinfo['template']
    scd = create_inst(ec2, tpl, 'scheduler', clname, prefix)[0]
    wrks = create_inst(ec2, tpl, 'worker', clname, prefix)
    winsts = clinfo['instance']['workers'] = []

    # Wait until the instances are running, then collect extra information.
    info("Wait for instance to be running.")
    scd = _wait_until_running(scd)
    clinfo['instance']['scheduler'] = instance_info(scd)
    for wrk in wrks:
        wrk = _wait_until_running(wrk)
        winsts.append(instance_info(wrk))
    save_cluster_info(clinfo)

    # The `ncpu_options` returned right after instance creation is unreliable,
    # so query the CPU info here instead.
    if len(wrks) > 0:
        # IP of the first worker.
        wip = _get_ip(winsts[0], clinfo['profile'].get('private_command'))
        tpl['worker']['cpu_info'] = get_cpu_info(tpl['worker'], wip)

def read_profile(profile, params=None):
    """Read a profile.

    Args:
        profile: profile name
        params: parameters to override

    Returns:
        dict: profile information
    """
    info("read_profile {}".format(profile))
    path = check_profile(profile)
    with codecs.open(path, 'rb', encoding='utf-8') as f:
        body = f.read()
    pro = json.loads(body)

    # Validate the profile contents.
    validate_by_schema(pro)

    # Apply override parameters if given.
    if params is not None:
        override_cfg_by_params(pro, params)
        # Validate the overridden contents.
        try:
            validate_by_schema(pro)
        except jsonschema.exceptions.ValidationError:
            msgs = ["There is an incorrect parameter:"]
            for param in params:
                msgs.append(' {}'.format(param))
            raise RuntimeError('\n'.join(msgs))

    pro['name'] = os.path.basename(path)
    return pro

def _wait_until_running(inst):
    """Wait until the instance is running, then reload its attributes."""
    info(f"_wait_until_running: {inst}")
    try:
        inst.wait_until_running()
        inst.load()
    except Exception as e:
        error(str(e))
        raise e
    return inst

def _get_run_python(path, params):
    """Build the shell command that runs a Python file with env-style parameters."""
    cmd = 'cd {} && '.format(NB_WORKDIR)
    for key, value in _iter_run_param(params):
        cmd += "{}={} ".format(key, value)
    cmd += "python {}".format(path)
    info("_get_run_python : {}".format(cmd))
    return cmd

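# Illustrative usage (not part of the original module; the file name and
# parameter value are hypothetical). Assuming _iter_run_param() yields
# (key, value) pairs, a call like
#
#   cmd = _get_run_python('train.py', ['ALPHA=1'])
#
# builds a command of roughly the form
# "cd <NB_WORKDIR> && ALPHA=1 python train.py".
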
def start_dask_cluster(clinfo):
    """Start the Dask scheduler & workers of a cluster."""
    critical("Start dask scheduler & workers.")
    private_command = clinfo.get('private_command')

    # Start the scheduler.
    stpl = clinfo['template']['scheduler']
    user, private_key = stpl['ssh_user'], stpl['ssh_private_key']
    scd = clinfo['instance']['scheduler']
    sip = _get_ip(scd, private_command)
    scd_dns = scd['private_dns_name']
    cmd = "screen -S bilbo -d -m dask-scheduler"
    send_instance_cmd(user, private_key, sip, cmd)

    # Install AWS credentials.
    setup_aws_creds(user, private_key, sip)

    # Determine worker run options.
    wrks = clinfo['instance']['workers']
    wip = _get_ip(wrks[0], private_command)
    info(" Get worker memory from '{}'".format(wip))
    cmd = "free -b | grep 'Mem:' | awk '{print $2}'"
    stdouts, _ = send_instance_cmd(user, private_key, wip, cmd)
    memory = int(stdouts[0])
    wtpl = clinfo['template']['worker']
    nproc, nthread, memory = dask_worker_options(wtpl, memory)

    # Record the decided options.
    wtpl['nproc'] = nproc
    wtpl['nthread'] = nthread
    wtpl['memory'] = memory

    # For every worker:
    user, private_key = wtpl['ssh_user'], wtpl['ssh_private_key']
    for wrk in wrks:
        wip = _get_ip(wrk, private_command)

        # Install AWS credentials.
        setup_aws_creds(user, private_key, wip)

        # Start the worker.
        opts = "--nprocs {} --nthreads {} --memory-limit {}".\
            format(nproc, nthread, memory)
        cmd = "screen -S bilbo -d -m dask-worker {}:8786 {}".\
            format(scd_dns, opts)
        warning(" Worker options: {}".format(opts))
        send_instance_cmd(user, private_key, wip, cmd)

    # Wait for the Dask scheduler's dashboard.
    dash_url = 'http://{}:8787'.format(sip)
    clinfo['dask_dashboard_url'] = dash_url
    critical("Wait for Dask dashboard ready.")
    try:
        wait_until_connect(dash_url)
    except Exception as e:
        error(str(e))
        raise e

def open_url(url, cldata):
    """Open a URL with the specified (or default) web browser."""
    info("open_url")
    wb = webbrowser
    if 'webbrowser' in cldata:
        path = cldata['webbrowser']
        info(" Using explicit web browser: {}".format(path))
        webbrowser.register('explicit', None,
                            webbrowser.BackgroundBrowser(path))
        wb = webbrowser.get('explicit')
    wb.open(url)

def wait_until_connect(url, retry_count=60):
    """Wait until the URL accepts connections."""
    info("wait_until_connect: {}".format(url))
    for i in range(retry_count):
        try:
            urlopen(url, timeout=8)
            return
        except (HTTPError, URLError, timeout):
            info("Can not connect to dashboard. Wait for a while.")
            time.sleep(TRY_SLEEP)
    raise ConnectionError()

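# Usage sketch (the URL is hypothetical):
# wait_until_connect('http://10.0.0.1:8787') returns as soon as the URL
# answers, and raises ConnectionError after retry_count failed attempts
# spaced TRY_SLEEP seconds apart.
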
def create_notebook(ec2, clinfo):
    """Create a notebook instance."""
    critical("Create notebook.")
    clname = clinfo['name']
    prefix = clinfo['profile'].get('instance_prefix')
    tpl = clinfo['template']
    nb = create_inst(ec2, tpl, 'notebook', clname, prefix)[0]

    info("Wait for notebook instance to be running.")
    nb = _wait_until_running(nb)
    clinfo['instance']['notebook'] = instance_info(nb)
    save_cluster_info(clinfo)

def get_root_dm(ec2, iinfo):
    """Get a root block device mapping of the desired size for the AMI."""
    volsize = iinfo.get('vol_size')
    if volsize is None:
        return []
    imgs = list(ec2.images.filter(ImageIds=[iinfo['ami']]))
    if len(imgs) == 0:
        raise ValueError("AMI {} does not exist.".format(iinfo['ami']))
    rdev = imgs[0].root_device_name
    dm = [{"DeviceName": rdev, "Ebs": {"VolumeSize": volsize}}]
    info("get_root_dm: {}".format(dm))
    return dm

def get_cpu_info(tpl, ip):
    """Get CPU info from a created instance via the lscpu command."""
    info("get_cpu_info")
    user = tpl['ssh_user']
    private_key = tpl['ssh_private_key']

    # Cores
    cmd = r"lscpu | grep -e ^CPU\(s\): | awk '{print $2}'"
    res, _ = send_instance_cmd(user, private_key, ip, cmd)
    num_core = int(res[0])

    # Threads per core
    cmd = "lscpu | grep Thread | awk '{print $4}'"
    res, _ = send_instance_cmd(user, private_key, ip, cmd)
    threads_per_core = int(res[0])

    cpu_info = {'CoreCount': num_core, 'ThreadsPerCore': threads_per_core}
    return cpu_info

def pause_instance(inst_ids):
    """Pause (stop) instances."""
    warning("pause_instance: '{}'".format(inst_ids))
    ec2 = boto3.client('ec2')

    # Check permission with a dry run.
    try:
        ec2.stop_instances(InstanceIds=inst_ids, DryRun=True)
    except botocore.exceptions.ClientError as e:
        if 'DryRunOperation' not in str(e):
            error(str(e))
            raise e

    # Stop the instances.
    try:
        response = ec2.stop_instances(InstanceIds=inst_ids, DryRun=False)
        info(response)
    except botocore.exceptions.ClientError as e:
        error(str(e))

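# Usage sketch (the instance id is hypothetical):
#
#   pause_instance(['i-0123456789abcdef0'])
#
# The dry run only verifies permission; the actual stop request is issued
# once and any client error is logged rather than re-raised.
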
def check_cluster(clname):
    """Check that the cluster file exists.

    Args:
        clname (str): cluster name (without the .json extension)
    """
    info(f"check_cluster {clname}")
    if clname.lower().endswith('.json'):
        rname = '.'.join(clname.split('.')[0:-1])
        msg = "Wrong cluster name '{}'. Use '{}' instead.". \
            format(clname, rname)
        raise NameError(msg)

    # Check file existence.
    path = os.path.join(clust_dir, clname + '.json')
    if not os.path.isfile(path):
        error("Cluster '{}' does not exist.".format(path))
        raise FileNotFoundError(path)

    return path

def _get_run_notebook(path, nb_params, cmd_params=None):
    """Build the papermill command that runs a notebook remotely."""
    assert type(nb_params) in [list, tuple]
    tname = next(tempfile._get_candidate_names())
    tmp = '/tmp/{}'.format(tname)
    elms = path.split('.')
    out_path = '.'.join(elms[:-1]) + '.out.' + elms[-1]

    cmd = "cd {} && ".format(NB_WORKDIR)
    if cmd_params is not None:
        assert type(cmd_params) in [list, tuple]
        for key, value in _iter_run_param(cmd_params):
            cmd += "{}={} ".format(key, value)
    cmd += "papermill --cwd {} --no-progress-bar --stdout-file " \
           "{} {} {}".format(NB_WORKDIR, tmp, path, out_path)
    for key, value in _iter_run_param(nb_params):
        cmd += " -p {} {}".format(key, value)

    info("_get_run_notebook : {}".format(cmd))
    return cmd, tmp

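# Illustrative sketch (the notebook path is hypothetical):
#
#   cmd, tmp = _get_run_notebook('run.ipynb', [])
#
# returns a papermill command of roughly the form
# "cd <NB_WORKDIR> && papermill --cwd <NB_WORKDIR> --no-progress-bar
#  --stdout-file /tmp/<random> run.ipynb run.out.ipynb"
# together with the temporary file path that captures the notebook's stdout.
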
def stop_notebook_or_python(clname, path, params):
    """Stop a running notebook / Python file."""
    info("stop_notebook_or_python: {} - {}".format(clname, path))
    check_cluster(clname)
    clinfo = load_cluster_info(clname)
    private_command = clinfo.get('private_command')

    if 'notebook' not in clinfo['instance']:
        raise RuntimeError("No notebook instance.")
    ntpl = clinfo['template']['notebook']
    user, private_key = ntpl['ssh_user'], ntpl['ssh_private_key']
    ip = _get_ip(clinfo['instance']['notebook'], private_command)

    ext = path.split('.')[-1].lower()
    dask_scd_addr = _get_dask_scheduler_address(clinfo)

    # Notebook file
    if ext == 'ipynb':
        # Run by papermill
        _cmd, _ = _get_run_notebook(path, params, [dask_scd_addr])
        _cmd = _cmd.replace('papermill', '[p]apermill')
    # Python file
    elif ext == 'py':
        params = list(params)
        params.insert(0, dask_scd_addr)
        _cmd = _get_run_python(path, params)
        _cmd = _cmd.replace('python', '[p]ython')
    else:
        raise RuntimeError("Unsupported file type: {}".format(path))

    # Find the running process.
    cmd = "ps auxww | grep '{}' | awk '{{print $2}}' | head -n 1".format(_cmd)
    res, _ = send_instance_cmd(user, private_key, ip, cmd, show_stdout=False,
                               show_stderr=False)

    # Kill the process if it exists.
    if len(res) > 0:
        proc = res[0].strip()
        cmd = "pkill -P {}".format(proc)
        info("Delete process: {}".format(cmd))
        res, _ = send_instance_cmd(user, private_key, ip, cmd,
                                   show_stderr=False)
    else:
        info("No process exists.")

def run_notebook_or_python(clname, path, params):
    """Run a notebook or Python file on the remote notebook instance.

    Returns:
        tuple: (stdout, exit_code)
    """
    info("run_notebook_or_python: {} - {}".format(clname, path))
    check_cluster(clname)
    clinfo = load_cluster_info(clname)
    dask = 'type' in clinfo and clinfo['type'] == 'dask'
    private_command = clinfo.get('private_command')

    if 'notebook' not in clinfo['instance']:
        raise RuntimeError("No notebook instance.")
    ntpl = clinfo['template']['notebook']
    user, private_key = ntpl['ssh_user'], ntpl['ssh_private_key']
    nb = clinfo['instance']['notebook']
    nip = _get_ip(nb, private_command)
    ext = path.split('.')[-1].lower()

    # Resolve the Dask scheduler address, either from the cluster info or
    # from an explicitly passed parameter.
    dask_scd_addr = None
    if dask:
        if 'scheduler' in clinfo['instance']:
            dask_scd_addr = _get_dask_scheduler_address(clinfo)
        else:
            for param in params:
                if param.startswith('DASK_SCHEDULER_ADDRESS'):
                    dask_scd_addr = param
        assert dask_scd_addr is not None, \
            "No Dask scheduler address available."

    def _check_err(err):
        err = '\n'.join(err)
        if 'error' in err.lower():
            print(err)
            return True
        return False

    # Notebook file
    if ext == 'ipynb':
        cparams = []
        if dask:
            cparams.insert(0, dask_scd_addr)
        # Run by papermill
        cmd, tmp = _get_run_notebook(path, params, cparams)
        res, err, excode = run_cmd_and_store_result(clname, user, private_key,
                                                    nip, cmd, show_stdout=True,
                                                    show_stderr=False)
        if not _check_err(err):
            cmd = 'cat {}'.format(tmp)
            res, err = send_instance_cmd(user, private_key, nip, cmd,
                                         show_stdout=True, show_stderr=False)
            if len(err) > 0 and 'No such file' not in err[0]:
                _check_err(err)
    # Python file
    elif ext == 'py':
        params = list(params)
        if dask:
            params.insert(0, dask_scd_addr)
        cmd = _get_run_python(path, params)
        res, err, excode = run_cmd_and_store_result(clname, user, private_key,
                                                    nip, cmd, show_stdout=True)
        _check_err(err)
    else:
        raise RuntimeError("Unsupported file type: {}".format(path))

    return res, excode

def start_notebook(clinfo, retry_count=60):
    """Start the notebook.

    Args:
        clinfo (dict): cluster creation information
        retry_count (int): number of retries to get the access URL.
            Defaults to 60.

    Raises:
        TimeoutError: when the retry count is exceeded
    """
    critical("Start notebook.")
    tpl = clinfo['template']['notebook']
    pro = clinfo['profile']
    user, private_key = tpl['ssh_user'], tpl['ssh_private_key']
    inst = clinfo['instance']['notebook']
    ip = _get_ip(inst, pro.get('private_command'))

    # Install AWS credentials.
    setup_aws_creds(user, private_key, ip)

    # Working directory
    nb_workdir = tpl.get('workdir', NB_WORKDIR)
    cmd = "mkdir -p {}".format(nb_workdir)
    send_instance_cmd(user, private_key, ip, cmd)

    # Set up git if configured.
    if 'notebook' in pro and 'git' in pro['notebook']:
        nb_git = pro['notebook']['git']
        cloned_dir = setup_git(nb_git, user, private_key, ip, nb_workdir)
        clinfo['git_cloned_dir'] = cloned_dir

    # Per-cluster-type notebook setup.
    vars = ''
    if 'type' in clinfo:
        if clinfo['type'] == 'dask':
            # Dashboard URL for dask-labextension.
            sip = clinfo['instance']['scheduler']['public_ip']
            cmd = "mkdir -p ~/.jupyter/lab/user-settings/dask-labextension; "
            cmd += 'echo \'{{ "defaultURL": "http://{}:8787" }}\' > ' \
                   '~/.jupyter/lab/user-settings/dask-labextension/' \
                   'plugin.jupyterlab-settings'.format(sip)
            send_instance_cmd(user, private_key, ip, cmd)
            # Scheduler address.
            vars = _get_dask_scheduler_address(clinfo)
        else:
            raise NotImplementedError()

    # Start Jupyter.
    ncmd = "cd {} && {} jupyter lab --ip 0.0.0.0".format(nb_workdir, vars)
    cmd = "screen -S bilbo -d -m bash -c '{}'".format(ncmd)
    send_instance_cmd(user, private_key, ip, cmd)

    # Get the access URL.
    cmd = "jupyter notebook list | awk '{print $1}'"
    for i in range(retry_count):
        stdouts, _ = send_instance_cmd(user, private_key, ip, cmd)
        # Record the URL once it is available.
        if len(stdouts) > 1 and len(stdouts[1]) > 0:
            url = stdouts[1].strip().replace('0.0.0.0', inst['public_ip'])
            clinfo['notebook_url'] = url
            return
        info("Can not fetch notebook list. Wait for a while.")
        time.sleep(TRY_SLEEP)

    raise TimeoutError("Can not get notebook url.")

def send_instance_cmd(ssh_user, ssh_private_key, ip, cmd,
                      show_stdout=False, show_stderr=True,
                      retry_count=30, get_excode=False):
    """Run an SSH command on an instance.

    https://stackoverflow.com/questions/42645196/how-to-ssh-and-run-commands-in-ec2-using-boto3

    Args:
        ssh_user (str): SSH user
        ssh_private_key (str): path of the SSH private key
        ip (str): IP of the target instance
        cmd (str): command string
        show_stdout (bool): whether to print stdout messages
        show_stderr (bool): whether to print error messages
        retry_count (int): number of connection retries
        get_excode (bool): whether to fetch the exit code. Defaults to False.

    Returns:
        tuple: (stdout, stderr) when get_excode is False,
            (stdout, stderr, exit_code) when it is True.
    """
    info('send_instance_cmd - user: {}, key: {}, ip {}, cmd {}'
         .format(ssh_user, ssh_private_key, ip, cmd))

    key_path = expanduser(ssh_private_key)
    key = paramiko.RSAKey.from_private_key_file(key_path)
    client = paramiko.SSHClient()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())

    connected = False
    for i in range(retry_count):
        try:
            client.connect(hostname=ip, username=ssh_user, pkey=key)
        except (paramiko.ssh_exception.NoValidConnectionsError, TimeoutError,
                BlockingIOError):
            warning("Connection failed to '{}'. Retry after a while.".
                    format(ip))
            time.sleep(TRY_SLEEP)
        else:
            connected = True
            break

    if not connected:
        error("Connection failed to '{}'".format(ip))
        return

    stdouts = []
    stderrs = []
    done_file = '/tmp/bilbo_rcmd_done'
    if get_excode:
        # Embed the exit code into a file.
        cmd = f"rm -f {done_file} && " + cmd + f" ; echo $? > {done_file}"

    # Interactive mode
    transport = client.get_transport()
    transport.set_keepalive(60)
    channel = transport.open_session()
    channel.exec_command(cmd)
    while True:
        time.sleep(0.1)
        if channel.recv_ready():
            recv = channel.recv(4096).decode('utf-8')
            stdouts.append(recv)
            if show_stdout:
                print(recv, end="")
        if channel.recv_stderr_ready():
            recv = channel.recv_stderr(4096).decode('utf-8')
            stderrs.append(recv)
        if channel.exit_status_ready():
            break

    stdouts = ''.join(stdouts).split('\n')
    stderr = ''.join(stderrs)
    if show_stderr and len(stderr) > 0:
        error(stderr)
    client.close()

    if get_excode:
        ccmd = f'if [ -f {done_file} ]; then cat {done_file}; fi'
        out, _ = send_instance_cmd(ssh_user, ssh_private_key, ip, ccmd)
        try:
            excode = int(out[0])
        except ValueError:
            excode = -1
        return stdouts, stderr, excode

    return stdouts, stderr

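# Usage sketch (user, key path and IP are hypothetical):
#
#   out, err = send_instance_cmd('ubuntu', '~/.ssh/mykey.pem',
#                                '10.0.0.12', 'ls -al')
#   out, err, excode = send_instance_cmd('ubuntu', '~/.ssh/mykey.pem',
#                                        '10.0.0.12', 'false',
#                                        get_excode=True)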