def show_cluster(clname, detail=False):
    """Print information about a saved cluster.

    Args:
        clname: Cluster name; validated with ``check_cluster`` first.
        detail: When true, pretty-print the whole stored cluster info
            and stop; otherwise print a short summary.

    Raises:
        NotImplementedError: If the stored cluster type is not 'dask'.
    """
    warning(f"show_cluster {clname}")
    check_cluster(clname)

    # Both output modes need the stored info, so load it once up front.
    clinfo = load_cluster_info(clname)
    if detail:
        pprint(clinfo)
        return

    print()
    print(f"Cluster Name: {clinfo['name']}")
    print(f"Ready Time: {clinfo['saved_time']}")

    instances = clinfo['instance']
    next_idx = 1
    if 'notebook' in instances:
        print()
        print("Notebook:")
        # show_instance returns the index to use for the next instance.
        next_idx = show_instance(next_idx, instances['notebook'])
        print()

    if 'type' in clinfo:
        cluster_type = clinfo['type']
        print(f"Cluster Type: {cluster_type}")
        if cluster_type != 'dask':
            raise NotImplementedError()
        show_dask_cluster(next_idx, clinfo)
    print()
def git_clone_cmd(repo, user, passwd, workdir):
    """Build the shell command that clones a git repository.

    The credentials are embedded in the clone URL as ``user:passwd@``.
    They are percent-encoded first, so characters such as ``@``, ``:``,
    ``/`` or shell metacharacters in a user name or password can neither
    corrupt the URL nor be interpreted by the remote shell.

    Args:
        repo: Repository URL including its scheme, e.g.
            ``https://host/path.git``.
        user: Git user name.
        passwd: Git password.
        workdir: Directory to ``cd`` into before cloning. It is inserted
            unquoted, so shell expansions such as ``~`` keep working.

    Returns:
        str: A command of the form ``cd <workdir> && git clone <url>``.

    Raises:
        ValueError: If ``repo`` contains no ``://`` separator.
    """
    from urllib.parse import quote

    warning("git clone: {}".format(repo))
    # Split on the first separator only; the rest of the URL is kept as is.
    protocol, address = repo.split('://', 1)
    safe_user = quote(str(user), safe='')
    safe_passwd = quote(str(passwd), safe='')
    url = "{}://{}:{}@{}".format(protocol, safe_user, safe_passwd, address)
    return "cd {} && git clone {}".format(workdir, url)
def destroy_cluster(clname, force):
    """Terminate a cluster's EC2 instances and delete its info file.

    Args:
        clname: Cluster name; validated with ``check_cluster`` first.
        force: When true, skip the ``check_git_modified`` gate.
    """
    warning(f"destroy_cluster {clname}")
    check_cluster(clname)
    clinfo = load_cluster_info(clname)

    # Without force, a cluster that has a git clone is only destroyed
    # when check_git_modified() returns a truthy value.
    needs_git_check = not force and 'git_cloned_dir' in clinfo
    if needs_git_check and not check_git_modified(clinfo):
        print("Canceled.")
        return

    critical("Destroy cluster '{}'.".format(clname))

    # Terminate instances: 'workers' holds a list of instance entries,
    # every other key holds a single entry.
    ec2 = boto3.client('ec2')
    target_ids = []
    for role, entry in clinfo['instance'].items():
        if role == 'workers':
            target_ids.extend(worker['instance_id'] for worker in entry)
        else:
            target_ids.append(entry['instance_id'])
    if target_ids:
        ec2.terminate_instances(InstanceIds=target_ids)

    # Remove the cluster info file.
    os.unlink(os.path.join(clust_dir, clname + '.json'))
def resume_instance(inst_ids, ec2):
    """Start (resume) the given EC2 instances.

    Args:
        inst_ids: List of instance ids to start.
        ec2: EC2 client object providing ``start_instances``.

    Raises:
        botocore.exceptions.ClientError: If the dry-run check fails for a
            reason other than 'DryRunOperation', or the real request fails
            for a reason other than the instances not being in a startable
            state yet.
    """
    warning("resume_instance: '{}'".format(inst_ids))

    # Permission check: the dry run reports success through a ClientError
    # mentioning 'DryRunOperation'; anything else is a real failure.
    try:
        ec2.start_instances(InstanceIds=inst_ids, DryRun=True)
    except botocore.exceptions.ClientError as e:
        if 'DryRunOperation' not in str(e):
            error(str(e))
            raise

    # Resume: keep retrying every 5 seconds while the instances are
    # reported as not being in a state that allows starting.
    started = False
    while not started:
        try:
            info(ec2.start_instances(InstanceIds=inst_ids, DryRun=False))
            started = True
        except botocore.exceptions.ClientError as e:
            reason = str(e)
            if 'is not in a state' not in reason:
                error(reason)
                raise
            time.sleep(5)
def init_instances(clinfo):
    """Run each role's template ``init_cmd`` commands on its instances.

    Roles are processed in the order notebook, scheduler, worker, and
    only when the role is present in ``clinfo['template']``. Commands are
    sent one at a time through ``send_instance_cmd``.

    Args:
        clinfo: Cluster info dict with 'template' and 'instance' entries.
    """
    warning("init_instances")
    templates = clinfo['template']
    instances = clinfo['instance']

    for role in ('notebook', 'scheduler', 'worker'):
        if role not in templates:
            continue
        role_tpl = templates[role]
        # The credentials are read before the init_cmd check, so a role
        # template lacking them raises KeyError even with no commands.
        ssh_user = role_tpl['ssh_user']
        ssh_key = role_tpl['ssh_private_key']
        if 'init_cmd' not in role_tpl:
            continue
        init_cmds = role_tpl['init_cmd']

        # The worker role maps to the list under 'workers'; other roles
        # map to a single instance entry under their own name.
        if role == 'worker':
            targets = instances['workers']
        else:
            targets = [instances[role]]

        for target in targets:
            ip = target['public_ip']
            for cmd in init_cmds:
                send_instance_cmd(ssh_user, ssh_key, ip, cmd)
def load_cluster_info(clname):
    """Read and parse a cluster's saved info file.

    Args:
        clname: Cluster name; the file read is ``<clname>.json`` inside
            the ``clust_dir`` directory.

    Returns:
        The decoded JSON content of the file.
    """
    warning("load_cluster_info: '{}'".format(clname))
    info_path = os.path.join(clust_dir, clname + '.json')
    with open(info_path, 'rt') as fp:
        return json.load(fp)
def start_dask_cluster(clinfo):
    """Start the Dask scheduler and workers of a cluster.

    Steps: start the scheduler in a detached ``screen`` session, install
    AWS credentials on it, probe the first worker for its total memory,
    derive worker options with ``dask_worker_options``, record them in
    the worker template, start every worker, then wait for the dashboard
    URL to accept connections.

    Side effects on ``clinfo``: sets ``nproc``, ``nthread`` and ``memory``
    in ``clinfo['template']['worker']`` and sets
    ``clinfo['dask_dashboard_url']``.

    Args:
        clinfo: Cluster info dict with 'template' and 'instance' entries
            for the scheduler and workers.

    Raises:
        Exception: Whatever ``wait_until_connect`` raises, after logging it.
    """
    critical("Start dask scheduler & workers.")
    private_command = clinfo.get('private_command')

    # Start the scheduler.
    stpl = clinfo['template']['scheduler']
    suser, skey = stpl['ssh_user'], stpl['ssh_private_key']
    scd = clinfo['instance']['scheduler']
    sip = _get_ip(scd, private_command)
    scd_dns = scd['private_dns_name']
    cmd = "screen -S bilbo -d -m dask-scheduler"
    send_instance_cmd(suser, skey, sip, cmd)

    # Install AWS credentials on the scheduler.
    setup_aws_creds(suser, skey, sip)

    # Work out the worker launch options. The probe runs on a worker
    # host, so it must use the worker template's SSH credentials; the
    # scheduler's only work when both templates happen to match.
    wtpl = clinfo['template']['worker']
    wuser, wkey = wtpl['ssh_user'], wtpl['ssh_private_key']
    wrks = clinfo['instance']['workers']
    wip = _get_ip(wrks[0], private_command)
    info(" Get worker memory from '{}'".format(wip))
    cmd = "free -b | grep 'Mem:' | awk '{print $2}'"
    stdouts, _ = send_instance_cmd(wuser, wkey, wip, cmd)
    memory = int(stdouts[0])
    nproc, nthread, memory = dask_worker_options(wtpl, memory)

    # Record the chosen options in the worker template.
    wtpl['nproc'] = nproc
    wtpl['nthread'] = nthread
    wtpl['memory'] = memory

    # The launch command is identical for every worker, so build it once.
    opts = "--nprocs {} --nthreads {} --memory-limit {}".format(
        nproc, nthread, memory)
    worker_cmd = "screen -S bilbo -d -m dask-worker {}:8786 {}".format(
        scd_dns, opts)

    for wrk in wrks:
        wip = _get_ip(wrk, private_command)
        # Install AWS credentials on the worker.
        setup_aws_creds(wuser, wkey, wip)
        # Start the worker.
        warning(" Worker options: {}".format(opts))
        send_instance_cmd(wuser, wkey, wip, worker_cmd)

    # Wait for the Dask scheduler's dashboard.
    dash_url = 'http://{}:8787'.format(sip)
    clinfo['dask_dashboard_url'] = dash_url
    critical("Wait for Dask dashboard ready.")
    try:
        wait_until_connect(dash_url)
    except Exception as e:
        error(str(e))
        raise
def open_notebook(clname, url_only=False):
    """Open (or just print) the notebook URL of a cluster.

    Args:
        clname: Cluster name; validated with ``check_cluster`` first.
        url_only: When true, print the URL instead of opening it.

    Raises:
        Exception: If the cluster info has no 'notebook_url' entry.
    """
    warning(f"open_notebook {clname}")
    check_cluster(clname)
    clinfo = load_cluster_info(clname)

    if 'notebook_url' not in clinfo:
        error("no notebook instance.")
        raise Exception("No notebook instance.")

    notebook_url = clinfo['notebook_url']
    if url_only:
        print(notebook_url)
        return
    open_url(notebook_url, clinfo)
def save_cluster_info(clinfo):
    """Write the cluster info to its JSON file.

    Stamps ``clinfo['saved_time']`` with the current local time, then
    writes the dict to ``<name>.json`` inside the ``clust_dir`` directory.

    Args:
        clinfo: Cluster info dict; must contain 'name'. It is modified in
            place (the 'saved_time' key is set).

    Raises:
        TypeError: If a value cannot be serialized to JSON. The existing
            file is left untouched in that case.
    """
    clname = clinfo['name']

    def json_default(value):
        # Dates and datetimes are stored as formatted strings.
        if isinstance(value, datetime.date):
            return value.strftime('%Y-%m-%d %H:%M:%S')
        raise TypeError('not JSON serializable')

    warning("save_cluster_info: '{}'".format(clname))
    clinfo['saved_time'] = str(datetime.datetime.now())

    path = os.path.join(clust_dir, clname + '.json')
    # Serialize before opening the file: opening with 'wt' truncates it,
    # so a serialization error after that point would destroy the
    # previously saved cluster info.
    body = json.dumps(clinfo, default=json_default, indent=4,
                      sort_keys=True, ensure_ascii=False)
    with open(path, 'wt') as f:
        f.write(body)
def pause_instance(inst_ids):
    """Stop (pause) the given EC2 instances.

    A failure of the real stop request is logged and not re-raised.

    Args:
        inst_ids: List of instance ids to stop.

    Raises:
        botocore.exceptions.ClientError: If the dry-run check fails for a
            reason other than 'DryRunOperation'.
    """
    warning("pause_instance: '{}'".format(inst_ids))
    ec2 = boto3.client('ec2')

    # Permission check: the dry run reports success through a ClientError
    # mentioning 'DryRunOperation'; anything else is a real failure.
    try:
        ec2.stop_instances(InstanceIds=inst_ids, DryRun=True)
    except botocore.exceptions.ClientError as e:
        reason = str(e)
        if 'DryRunOperation' not in reason:
            error(reason)
            raise e

    # Stop the instances.
    try:
        info(ec2.stop_instances(InstanceIds=inst_ids, DryRun=False))
    except botocore.exceptions.ClientError as e:
        error(str(e))
def _update_cluster_info(ec2, clname, inst_ids, clinfo):
    """Wait for instances to get public IPs, then store the new IPs.

    Polls ``describe_instances`` every 5 seconds until every returned
    instance has a 'PublicIpAddress', copies those addresses into the
    matching notebook / scheduler / worker entries of ``clinfo``, and
    saves the cluster info.

    Every instance of every reservation is considered. Instances can be
    spread over several reservations, and one reservation can hold
    several instances, so looking only at the first of either leaves
    some entries with stale addresses.

    Args:
        ec2: EC2 client object providing ``describe_instances``.
        clname: Cluster name (not used here).
        inst_ids: Instance ids to query.
        clinfo: Cluster info dict; updated in place.

    Returns:
        The updated ``clinfo``.
    """
    # Wait until the information is refreshed.
    while True:
        warning("Wait until available.")
        time.sleep(5)
        res = ec2.describe_instances(InstanceIds=inst_ids)
        described = [inst
                     for reserv in res['Reservations']
                     for inst in reserv['Instances']]
        pending = next(
            (inst for inst in described if 'PublicIpAddress' not in inst),
            None)
        if pending is None:
            break
        print(pending)

    # Apply the changed addresses.
    new_ips = {inst['InstanceId']: inst['PublicIpAddress']
               for inst in described}
    insts = clinfo['instance']

    # Collect every stored entry that may need a new address.
    entries = []
    if 'notebook' in insts:
        entries.append(insts['notebook'])
    if 'scheduler' in insts:
        entries.append(insts['scheduler'])
    if 'workers' in insts:
        entries.extend(insts['workers'])

    for entry in entries:
        inst_id = entry['instance_id']
        if inst_id in new_ips:
            entry['public_ip'] = new_ips[inst_id]

    save_cluster_info(clinfo)
    return clinfo
def _collect_channel_output(channel, show_stdout):
    """Read stdout/stderr text from a running channel until it exits."""
    import codecs

    # Incremental decoders keep a multibyte UTF-8 character intact when
    # it is split across two 4096-byte reads.
    out_decoder = codecs.getincrementaldecoder('utf-8')(errors='replace')
    err_decoder = codecs.getincrementaldecoder('utf-8')(errors='replace')
    out_parts = []
    err_parts = []

    def _emit_stdout(text):
        out_parts.append(text)
        if show_stdout:
            print(text, end="")

    def _pump():
        # Read at most one chunk from each stream; report if any was read.
        received = False
        if channel.recv_ready():
            _emit_stdout(out_decoder.decode(channel.recv(4096)))
            received = True
        if channel.recv_stderr_ready():
            err_parts.append(err_decoder.decode(channel.recv_stderr(4096)))
            received = True
        return received

    while True:
        time.sleep(0.1)
        _pump()
        if channel.exit_status_ready():
            break

    # The exit status can be ready while output is still buffered; drain
    # it so the tail of the output is not lost.
    while _pump():
        pass

    tail = out_decoder.decode(b'', final=True)
    if tail:
        _emit_stdout(tail)
    err_parts.append(err_decoder.decode(b'', final=True))
    return ''.join(out_parts), ''.join(err_parts)


def send_instance_cmd(ssh_user, ssh_private_key, ip, cmd,
                      show_stdout=False, show_stderr=True, retry_count=30,
                      get_excode=False):
    """Run a command on an instance over SSH.

    https://stackoverflow.com/questions/42645196/how-to-ssh-and-run-commands-in-ec2-using-boto3

    Args:
        ssh_user (str): SSH user.
        ssh_private_key (str): Path of the SSH private key.
        ip (str): IP of the target instance.
        cmd (str): Command string to execute.
        show_stdout (bool): Whether to print standard output as it arrives.
        show_stderr (bool): Whether to log the collected error output.
        retry_count (int): Number of connection attempts.
        get_excode (bool): Whether to fetch the exit code. Default False.

    Returns:
        tuple or None: ``(stdout_lines, stderr)`` where ``stdout_lines`` is
        the standard output split on newlines and ``stderr`` is one
        string. With ``get_excode`` the tuple is
        ``(stdout_lines, stderr, exit_code)``; ``exit_code`` is -1 when it
        could not be determined. ``None`` is returned when no connection
        could be established within ``retry_count`` attempts.
    """
    info('send_instance_cmd - user: {}, key: {}, ip {}, cmd {}'
         .format(ssh_user, ssh_private_key, ip, cmd))

    key_path = expanduser(ssh_private_key)
    key = paramiko.RSAKey.from_private_key_file(key_path)
    client = paramiko.SSHClient()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())

    connected = False
    for _ in range(retry_count):
        try:
            client.connect(hostname=ip, username=ssh_user, pkey=key)
        except (paramiko.ssh_exception.NoValidConnectionsError,
                TimeoutError, BlockingIOError):
            warning("Connection failed to '{}'. Retry after a while.".
                    format(ip))
            time.sleep(TRY_SLEEP)
        else:
            connected = True
            break

    if not connected:
        error("Connection failed to '{}'".format(ip))
        client.close()
        return None

    done_file = '/tmp/bilbo_rcmd_done'
    if get_excode:
        # Make the remote shell write the command's exit code to a file
        # that is read back afterwards.
        cmd = f"rm -f {done_file} && " + cmd + f" ; echo $? > {done_file}"

    # Interactive mode. The client is closed even when execution fails.
    try:
        transport = client.get_transport()
        transport.set_keepalive(60)
        channel = transport.open_session()
        channel.exec_command(cmd)
        stdout_text, stderr = _collect_channel_output(channel, show_stdout)
    finally:
        client.close()

    stdouts = stdout_text.split('\n')
    if show_stderr and len(stderr) > 0:
        error(stderr)

    if not get_excode:
        return stdouts, stderr

    ccmd = f'if [ -f {done_file} ]; then cat {done_file}; fi'
    result = send_instance_cmd(ssh_user, ssh_private_key, ip, ccmd)
    try:
        # result is None when the follow-up connection failed.
        excode = int(result[0][0])
    except (TypeError, ValueError):
        excode = -1
    return stdouts, stderr, excode