def resume_instance(inst_ids, ec2):
    """Resume (start) the given instances.

    A dry-run request is issued first as a permission check: a ClientError
    whose text contains 'DryRunOperation' is treated as "allowed", any other
    ClientError is logged and re-raised.  The real start request is then
    repeated every 5 seconds for as long as it fails with a message containing
    'is not in a state'; any other ClientError is logged and re-raised.

    Args:
        inst_ids (list): IDs of the instances to start.
        ec2: Client object providing ``start_instances``.

    Raises:
        botocore.exceptions.ClientError: On a failed permission check or a
            start failure that is not the retryable state error.
    """
    warning("resume_instance: '{}'".format(inst_ids))

    # Permission check
    try:
        ec2.start_instances(InstanceIds=inst_ids, DryRun=True)
    except botocore.exceptions.ClientError as dry_err:
        dry_msg = str(dry_err)
        if 'DryRunOperation' not in dry_msg:
            error(dry_msg)
            raise

    # Resume, retrying while the instances are not yet in a startable state
    started = False
    while not started:
        try:
            response = ec2.start_instances(InstanceIds=inst_ids, DryRun=False)
            info(response)
        except botocore.exceptions.ClientError as start_err:
            reason = str(start_err)
            if 'is not in a state' in reason:
                time.sleep(5)
                continue
            error(reason)
            raise
        started = True
def _wait_until_running(inst):
    """Wait for an instance to be running, reload it, and return it.

    Calls ``inst.wait_until_running()`` followed by ``inst.load()``.  Any
    exception from either call is logged and re-raised.

    Args:
        inst: Instance object exposing ``wait_until_running`` and ``load``.

    Returns:
        The same ``inst`` object after it has been reloaded.
    """
    info(f"_wait_until_running: {inst}")
    try:
        inst.wait_until_running()
        inst.load()
    except Exception as exc:
        error(str(exc))
        raise exc
    else:
        return inst
def start_dask_cluster(clinfo):
    """Start the Dask scheduler and workers of a cluster.

    Steps, in order:
      1. Launch ``dask-scheduler`` in a detached ``screen`` session on the
         scheduler instance and install AWS credentials there.
      2. Query the first worker for its total memory (bytes) and pass it to
         ``dask_worker_options`` to obtain nproc / nthread / memory settings.
      3. Record those settings in the worker template of ``clinfo``.
      4. On every worker: install AWS credentials and launch ``dask-worker``
         pointed at the scheduler's private DNS name (port 8786).
      5. Store the dashboard URL in ``clinfo['dask_dashboard_url']`` and wait
         until it is reachable.

    Args:
        clinfo (dict): Cluster information.  Read keys: 'private_command'
            (optional), 'template' -> 'scheduler' / 'worker', and
            'instance' -> 'scheduler' / 'workers'.  Mutated in place
            (worker template options and 'dask_dashboard_url').

    Raises:
        Exception: Whatever ``wait_until_connect`` raises is logged and
            re-raised.
    """
    critical("Start dask scheduler & workers.")
    private_command = clinfo.get('private_command')

    # Start the scheduler
    stpl = clinfo['template']['scheduler']
    sched_user, sched_key = stpl['ssh_user'], stpl['ssh_private_key']
    scd = clinfo['instance']['scheduler']
    sip = _get_ip(scd, private_command)
    scd_dns = scd['private_dns_name']
    cmd = "screen -S bilbo -d -m dask-scheduler"
    send_instance_cmd(sched_user, sched_key, sip, cmd)

    # Install AWS credentials on the scheduler
    setup_aws_creds(sched_user, sched_key, sip)

    # Worker connections must use the worker template's SSH credentials.
    # (Previously the memory probe below reused the scheduler's credentials,
    # which breaks when the two templates differ.)
    wtpl = clinfo['template']['worker']
    user, private_key = wtpl['ssh_user'], wtpl['ssh_private_key']

    # Determine worker launch options from the first worker's total memory
    wrks = clinfo['instance']['workers']
    wip = _get_ip(wrks[0], private_command)
    info(" Get worker memory from '{}'".format(wip))
    cmd = "free -b | grep 'Mem:' | awk '{print $2}'"
    stdouts, _ = send_instance_cmd(user, private_key, wip, cmd)
    memory = int(stdouts[0])
    nproc, nthread, memory = dask_worker_options(wtpl, memory)

    # Record the chosen options in the worker template
    wtpl['nproc'] = nproc
    wtpl['nthread'] = nthread
    wtpl['memory'] = memory

    # The option string and launch command are identical for every worker
    opts = "--nprocs {} --nthreads {} --memory-limit {}".\
        format(nproc, nthread, memory)
    worker_cmd = "screen -S bilbo -d -m dask-worker {}:8786 {}".\
        format(scd_dns, opts)

    # For every worker
    for wrk in wrks:
        wip = _get_ip(wrk, private_command)
        # Install AWS credentials
        setup_aws_creds(user, private_key, wip)
        # Start the worker
        warning(" Worker options: {}".format(opts))
        send_instance_cmd(user, private_key, wip, worker_cmd)

    # Wait for the Dask scheduler dashboard
    dash_url = 'http://{}:8787'.format(sip)
    clinfo['dask_dashboard_url'] = dash_url
    critical("Wait for Dask dashboard ready.")
    try:
        wait_until_connect(dash_url)
    except Exception as e:
        error(str(e))
        raise e
def open_notebook(clname, url_only=False):
    """Open the notebook of a cluster, or just print its URL.

    Args:
        clname (str): Cluster name, validated with ``check_cluster``.
        url_only (bool): When true, print the notebook URL instead of
            opening it with ``open_url``.

    Raises:
        Exception: If the cluster info has no 'notebook_url' entry.
    """
    warning(f"open_notebook {clname}")
    check_cluster(clname)
    clinfo = load_cluster_info(clname)

    # Guard: nothing to open without a recorded notebook URL
    if 'notebook_url' not in clinfo:
        error("no notebook instance.")
        raise Exception("No notebook instance.")

    url = clinfo['notebook_url']
    if url_only:
        print(url)
        return
    open_url(url, clinfo)
def create_ec2_instances(ec2, tpl, cnt, tag_spec):
    """Create EC2 instances from a template.

    Args:
        ec2: Object providing ``create_instances``.
        tpl (dict): Template; keys 'ami', 'ec2type', 'keyname' and
            'security_group' are read here.
        cnt (int): Number of instances, used as both MinCount and MaxCount.
        tag_spec: Value passed through as ``TagSpecifications``.

    Returns:
        The result of ``ec2.create_instances``, or None when a ClientError
        containing 'Request would have succeeded' was raised.

    Raises:
        botocore.exceptions.ClientError: Any other client error (after it
            has been logged).
    """
    rdm = get_root_dm(ec2, tpl)
    request = dict(
        ImageId=tpl['ami'],
        InstanceType=tpl['ec2type'],
        MinCount=cnt,
        MaxCount=cnt,
        KeyName=tpl['keyname'],
        BlockDeviceMappings=rdm,
        SecurityGroupIds=[tpl['security_group']],
        TagSpecifications=tag_spec,
    )
    try:
        return ec2.create_instances(**request)
    except botocore.exceptions.ClientError as exc:
        detail = str(exc)
        error("create_ec2_instances - {}".format(detail))
        if 'Request would have succeeded' not in detail:
            raise exc
    return None
def check_profile(proname):
    """Validate a profile name and return the path of its file.

    Args:
        proname (str): Profile name, including the '.json' extension.

    Returns:
        str: ``prof_dir`` joined with ``proname``.

    Raises:
        NameError: If ``proname`` does not end with '.json'
            (case-insensitive).
        FileNotFoundError: If the profile file does not exist.
    """
    has_json_ext = proname.lower().endswith('.json')
    if has_json_ext:
        # File existence
        path = os.path.join(prof_dir, proname)
        if os.path.isfile(path):
            return path
        error("Profile '{}' does not exist.".format(path))
        raise FileNotFoundError(path)

    msg = "Wrong profile name '{}'. Use '{}.json' instead.".format(
        proname, proname)
    raise NameError(msg)
def pause_instance(inst_ids):
    """Pause (stop) the given instances.

    A dry-run request is issued first as a permission check: a ClientError
    whose text contains 'DryRunOperation' is treated as "allowed", any other
    ClientError is logged and re-raised.  A ClientError from the real stop
    request is only logged, not raised.

    Args:
        inst_ids (list): IDs of the instances to stop.

    Raises:
        botocore.exceptions.ClientError: If the permission check fails.
    """
    warning("pause_instance: '{}'".format(inst_ids))
    ec2 = boto3.client('ec2')

    # Permission check
    try:
        ec2.stop_instances(InstanceIds=inst_ids, DryRun=True)
    except botocore.exceptions.ClientError as dry_err:
        dry_msg = str(dry_err)
        if 'DryRunOperation' not in dry_msg:
            error(dry_msg)
            raise dry_err

    # Stop; failures here are reported but deliberately not propagated
    try:
        response = ec2.stop_instances(InstanceIds=inst_ids, DryRun=False)
    except botocore.exceptions.ClientError as stop_err:
        error(str(stop_err))
    else:
        info(response)
def check_cluster(clname):
    """Validate a cluster name and return the path of its info file.

    Args:
        clname (str): Cluster name, without the '.json' extension.

    Returns:
        str: ``clust_dir`` joined with ``clname + '.json'``.

    Raises:
        NameError: If ``clname`` ends with '.json' (case-insensitive).
        FileNotFoundError: If the cluster file does not exist.
    """
    info(f"check_cluster {clname}")

    if clname.lower().endswith('.json'):
        # Suggest the name with its final extension removed
        rname = clname.rsplit('.', 1)[0]
        raise NameError(
            "Wrong cluster name '{}'. Use '{}' instead.".format(clname, rname))

    # File existence
    path = os.path.join(clust_dir, clname + '.json')
    if os.path.isfile(path):
        return path
    error("Cluster '{}' does not exist.".format(path))
    raise FileNotFoundError(path)
def send_instance_cmd(ssh_user, ssh_private_key, ip, cmd, show_stdout=False,
                      show_stderr=True, retry_count=30, get_excode=False):
    """Run a shell command on an instance over SSH.

    https://stackoverflow.com/questions/42645196/how-to-ssh-and-run-commands-in-ec2-using-boto3

    Args:
        ssh_user (str): SSH user name.
        ssh_private_key (str): Path of the SSH private key ('~' is expanded).
        ip (str): IP address of the target instance.
        cmd (str): Command line to execute (it is concatenated with other
            strings when ``get_excode`` is set, so it must be a string).
        show_stdout (bool): Echo the remote standard output as it arrives.
        show_stderr (bool): Log the remote standard error once finished.
        retry_count (int): Number of connection attempts.
        get_excode (bool): Also fetch the command's exit code.  Default False.

    Returns:
        tuple or None: ``(stdout_lines, stderr_text)``, or
        ``(stdout_lines, stderr_text, exit_code)`` when ``get_excode`` is
        true (``exit_code`` is -1 if it could not be determined).  None is
        returned when no connection could be established within
        ``retry_count`` attempts.
    """
    # Local import keeps this block self-contained with respect to the
    # file's import section.
    import codecs

    info('send_instance_cmd - user: {}, key: {}, ip {}, cmd {}'
         .format(ssh_user, ssh_private_key, ip, cmd))

    key_path = expanduser(ssh_private_key)
    key = paramiko.RSAKey.from_private_key_file(key_path)
    client = paramiko.SSHClient()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())

    done_file = '/tmp/bilbo_rcmd_done'
    stdouts = []
    stderrs = []

    # try/finally guarantees the SSH client is closed on every path,
    # including a failed connection or an exception while running the command.
    try:
        connected = False
        for _ in range(retry_count):
            try:
                client.connect(hostname=ip, username=ssh_user, pkey=key)
            except (paramiko.ssh_exception.NoValidConnectionsError,
                    TimeoutError, BlockingIOError):
                warning("Connection failed to '{}'. Retry after a while.".
                        format(ip))
                time.sleep(TRY_SLEEP)
            else:
                connected = True
                break

        if not connected:
            error("Connection failed to '{}'".format(ip))
            return None

        if get_excode:
            # Make the remote shell write the command's exit code to a file
            # that is read back by a follow-up command below.
            cmd = f"rm -f {done_file} && " + cmd + f" ; echo $? > {done_file}"

        # Interactive mode
        transport = client.get_transport()
        transport.set_keepalive(60)
        channel = transport.open_session()
        channel.exec_command(cmd)

        # Incremental decoders keep a multi-byte UTF-8 character intact when
        # it happens to be split across two 4096-byte reads.
        out_decoder = codecs.getincrementaldecoder('utf-8')(errors='replace')
        err_decoder = codecs.getincrementaldecoder('utf-8')(errors='replace')

        def pump():
            """Read everything currently buffered on both streams."""
            while channel.recv_ready():
                chunk = channel.recv(4096)
                if not chunk:
                    break
                text = out_decoder.decode(chunk)
                stdouts.append(text)
                if show_stdout:
                    print(text, end="")
            while channel.recv_stderr_ready():
                chunk = channel.recv_stderr(4096)
                if not chunk:
                    break
                stderrs.append(err_decoder.decode(chunk))

        while True:
            # Sample the exit flag BEFORE reading so that the final pump
            # collects all output that was buffered when the command ended
            # (the previous loop could drop trailing output).
            finished = channel.exit_status_ready()
            pump()
            if finished:
                break
            time.sleep(0.1)

        # Flush any bytes still held by the decoders
        stdouts.append(out_decoder.decode(b'', final=True))
        stderrs.append(err_decoder.decode(b'', final=True))

        stdouts = ''.join(stdouts).split('\n')
        stderr = ''.join(stderrs)
        if show_stderr and len(stderr) > 0:
            error(stderr)
    finally:
        client.close()

    if not get_excode:
        return stdouts, stderr

    # Read back the exit code written by the wrapped command
    ccmd = f'if [ -f {done_file} ]; then cat {done_file}; fi'
    result = send_instance_cmd(ssh_user, ssh_private_key, ip, ccmd)
    excode = -1
    if result is not None:
        try:
            excode = int(result[0][0])
        except ValueError:
            excode = -1
    return stdouts, stderr, excode