def test_describe_cluster(make_stubber, error_code):
    emr_client = boto3.client('emr')
    emr_stubber = make_stubber(emr_client)
    cluster_id = 'test-id'
    cluster = {'Id': cluster_id, 'Name': 'cluster-name'}

    emr_stubber.stub_describe_cluster(cluster_id, cluster, error_code=error_code)

    if error_code is None:
        got_cluster = emr_basics.describe_cluster(cluster_id, emr_client)
        assert got_cluster == cluster
    else:
        with pytest.raises(ClientError) as exc_info:
            emr_basics.describe_cluster(cluster_id, emr_client)
        assert exc_info.value.response['Error']['Code'] == error_code
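# A self-contained sketch of the stubbing pattern the test above relies on.
# The project's make_stubber fixture and stub_describe_cluster helper are not
# shown in this listing, so this sketch uses botocore's Stubber directly and
# calls the EMR client instead of emr_basics.describe_cluster; the error code
# value is illustrative.
import boto3
import pytest
from botocore.exceptions import ClientError
from botocore.stub import Stubber


@pytest.mark.parametrize('error_code', [None, 'TestException'])
def test_describe_cluster_stubbed(error_code):
    emr_client = boto3.client(
        'emr', region_name='us-east-1',
        aws_access_key_id='test-key', aws_secret_access_key='test-secret')
    cluster = {'Id': 'test-id', 'Name': 'cluster-name'}
    with Stubber(emr_client) as stubber:
        if error_code is None:
            # Stub a successful DescribeCluster response and verify the data round-trips.
            stubber.add_response(
                'describe_cluster', {'Cluster': cluster},
                expected_params={'ClusterId': cluster['Id']})
            got = emr_client.describe_cluster(ClusterId=cluster['Id'])['Cluster']
            assert got == cluster
        else:
            # Stub a client error and verify the error code is surfaced.
            stubber.add_client_error('describe_cluster', service_error_code=error_code)
            with pytest.raises(ClientError) as exc_info:
                emr_client.describe_cluster(ClusterId=cluster['Id'])
            assert exc_info.value.response['Error']['Code'] == error_code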
def demo_long_lived_cluster():
    """
    Shows how to create a long-lived cluster that waits after all steps are run
    so that more steps can be run. At the end of the demo, the cluster is
    optionally terminated.
    """
    print('-' * 88)
    print("Welcome to the Amazon EMR long-lived cluster demo.")
    print('-' * 88)

    prefix = 'aws-demo-long-emr'

    s3_resource = boto3.resource('s3')
    iam_resource = boto3.resource('iam')
    emr_client = boto3.client('emr')
    ec2_resource = boto3.resource('ec2')

    # Set up resources for the demo.
    bucket_name = f'{prefix}-{time.time_ns()}'
    script_file_name = 'pyspark_top_product_keyword.py'
    script_key = f'scripts/{script_file_name}'
    bucket = setup_bucket(bucket_name, script_file_name, script_key, s3_resource)
    job_flow_role, service_role = create_roles(
        f'{prefix}-ec2-role', f'{prefix}-service-role', iam_resource)
    security_groups = create_security_groups(prefix, ec2_resource)

    print("Wait for 10 seconds to give roles and profiles time to propagate...")
    time.sleep(10)

    max_tries = 5
    while True:
        try:
            cluster_id = emr_basics.run_job_flow(
                f'{prefix}-cluster', f's3://{bucket_name}/logs',
                True, ['Hadoop', 'Hive', 'Spark'], job_flow_role, service_role,
                security_groups, [], emr_client)
            print(f"Running job flow for cluster {cluster_id}...")
            break
        except ClientError as error:
            max_tries -= 1
            if max_tries > 0 and \
                    error.response['Error']['Code'] == 'ValidationException':
                print("Instance profile is not ready, let's give it more time...")
                time.sleep(10)
            else:
                raise

    status_poller(
        "Waiting for cluster, this typically takes several minutes...",
        'WAITING',
        lambda: emr_basics.describe_cluster(cluster_id, emr_client)['Status']['State'])

    add_top_product_step(
        '20', 'Books', 'fire', cluster_id, bucket, script_key, emr_client)
    add_top_product_step(
        '20', 'Grocery', 'cheese', cluster_id, bucket, script_key, emr_client)

    review_bucket_folders = s3_resource.meta.client.list_objects_v2(
        Bucket='amazon-reviews-pds', Prefix='parquet/', Delimiter='/', MaxKeys=100)
    categories = [
        cat['Prefix'].split('=')[1][:-1]
        for cat in review_bucket_folders['CommonPrefixes']]

    while True:
        while True:
            input_cat = input(
                f"Your turn! Possible categories are: {categories}. Which category "
                f"would you like to search (enter 'none' when you're done)? ")
            if input_cat.lower() == 'none' or input_cat in categories:
                break
            else:
                print(f"Sorry, {input_cat} is not an allowed category!")
        if input_cat.lower() == 'none':
            break
        else:
            input_keyword = input("What keyword would you like to search for? ")
            input_count = input("How many items would you like to list? ")
            add_top_product_step(
                input_count, input_cat, input_keyword, cluster_id, bucket,
                script_key, emr_client)

    # Clean up demo resources (if you want to).
    remove_everything = input(
        "Do you want to terminate the cluster and delete the security roles, "
        "groups, bucket, and all of its contents (y/n)? ")
    if remove_everything.lower() == 'y':
        emr_basics.terminate_cluster(cluster_id, emr_client)
        status_poller(
            "Waiting for cluster to terminate.",
            'TERMINATED',
            lambda: emr_basics.describe_cluster(cluster_id, emr_client)['Status']['State'])
        delete_security_groups(security_groups)
        delete_roles([job_flow_role, service_role])
        delete_bucket(bucket)
    else:
        print(
            "Remember that running Amazon EMR clusters and objects kept in an "
            "Amazon S3 bucket can incur charges against your account.")

    print("Thanks for watching!")
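# The setup_bucket, create_roles, create_security_groups, add_top_product_step,
# and delete_* helpers used above are defined elsewhere in the demo module and
# are not shown here. As an illustration, here is a minimal sketch of the
# status_poller helper, under the assumption that it simply polls the supplied
# function until the goal state is reported (the real helper may print
# its progress differently):
import time


def status_poller(intro, done_status, func, poll_seconds=10):
    """Poll func until it returns done_status, printing progress along the way."""
    print(intro)
    status = None
    while True:
        prev_status = status
        status = func()
        if status != prev_status:
            # Report each state transition on its own line.
            print(f"Current status: {status}")
        else:
            # Show a dot while the state is unchanged.
            print('.', end='', flush=True)
        if status == done_status:
            break
        time.sleep(poll_seconds)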
def demo_short_lived_cluster():
    """
    Shows how to create a short-lived cluster that runs a step and automatically
    terminates after the step completes.
    """
    print('-' * 88)
    print("Welcome to the Amazon EMR short-lived cluster demo.")
    print('-' * 88)

    prefix = 'aws-demo-short-emr'

    s3_resource = boto3.resource('s3')
    iam_resource = boto3.resource('iam')
    emr_client = boto3.client('emr')
    ec2_resource = boto3.resource('ec2')

    # Set up resources for the demo.
    bucket_name = f'{prefix}-{time.time_ns()}'
    script_file_name = 'pyspark_estimate_pi.py'
    script_key = f'scripts/{script_file_name}'
    bucket = setup_bucket(bucket_name, script_file_name, script_key, s3_resource)
    job_flow_role, service_role = create_roles(
        f'{prefix}-ec2-role', f'{prefix}-service-role', iam_resource)
    security_groups = create_security_groups(prefix, ec2_resource)

    # Run the job.
    output_prefix = 'pi-calc-output'
    pi_step = {
        'name': 'estimate-pi-step',
        'script_uri': f's3://{bucket_name}/{script_key}',
        'script_args': [
            '--partitions', '3',
            '--output_uri', f's3://{bucket_name}/{output_prefix}']
    }

    print("Wait for 10 seconds to give roles and profiles time to propagate...")
    time.sleep(10)

    max_tries = 5
    while True:
        try:
            cluster_id = emr_basics.run_job_flow(
                f'{prefix}-cluster', f's3://{bucket_name}/logs',
                False, ['Hadoop', 'Hive', 'Spark'], job_flow_role, service_role,
                security_groups, [pi_step], emr_client)
            print(f"Running job flow for cluster {cluster_id}...")
            break
        except ClientError as error:
            max_tries -= 1
            if max_tries > 0 and \
                    error.response['Error']['Code'] == 'ValidationException':
                print("Instance profile is not ready, let's give it more time...")
                time.sleep(10)
            else:
                raise

    status_poller(
        "Waiting for cluster, this typically takes several minutes...",
        'RUNNING',
        lambda: emr_basics.describe_cluster(cluster_id, emr_client)['Status']['State'])

    status_poller(
        "Waiting for step to complete...",
        'PENDING',
        lambda: emr_basics.list_steps(cluster_id, emr_client)[0]['Status']['State'])

    status_poller(
        "Waiting for cluster to terminate.",
        'TERMINATED',
        lambda: emr_basics.describe_cluster(cluster_id, emr_client)['Status']['State'])

    print(f"Job complete! The script, logs, and output for this demo are in "
          f"Amazon S3 bucket {bucket_name}. The output is:")
    for obj in bucket.objects.filter(Prefix=output_prefix):
        print(obj.get()['Body'].read().decode())

    # Clean up demo resources (if you want to).
    remove_everything = input(
        "Do you want to delete the security roles, groups, and bucket (y/n)? ")
    if remove_everything.lower() == 'y':
        delete_security_groups(security_groups)
        delete_roles([job_flow_role, service_role])
        delete_bucket(bucket)
    else:
        print(
            "Remember that objects kept in an Amazon S3 bucket can incur charges "
            "against your account.")

    print("Thanks for watching!")
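# emr_basics.run_job_flow is defined in a separate module. As an illustration,
# here is a sketch of how a step dict such as pi_step is commonly turned into
# the Steps argument of the EMR RunJobFlow API, assuming the usual
# spark-submit-through-command-runner.jar pattern (the project's actual helper
# may build its steps differently):
def build_spark_steps(steps):
    """Convert {'name', 'script_uri', 'script_args'} dicts into EMR step definitions."""
    return [{
        'Name': step['name'],
        'ActionOnFailure': 'CONTINUE',
        'HadoopJarStep': {
            # command-runner.jar lets the step invoke spark-submit on the cluster.
            'Jar': 'command-runner.jar',
            'Args': [
                'spark-submit', '--deploy-mode', 'cluster',
                step['script_uri'], *step['script_args']],
        },
    } for step in steps]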
def run(self):
    # Start the new job flow; the cluster name and log_uri are generated by self.
    s3_resource = boto3.resource('s3')
    iam_resource = boto3.resource('iam')
    emr_client = boto3.client('emr')
    ec2_resource = boto3.resource('ec2')

    # Set up resources for the cluster. Use a fixed S3 bucket rather than
    # creating a new one.
    bucket_name = "silhouette-rcv1"
    bucket = s3_resource.Bucket(bucket_name)
    job_flow_role, service_role = create_roles(
        f'{self._cluster_name[0:50]}-ec2-role',
        f'{self._cluster_name[0:50]}-service-role', iam_resource)
    security_groups = create_security_groups(self._cluster_name, ec2_resource)

    script_key = ""
    output_prefix = f'{self._cluster_name}'
    # this_step = {  # TODO: fill in the algorithm configuration.
    #     'name': 'kmeans',  # TODO: the algorithm name.
    #     'script_uri': f's3://{bucket_name}/{script_key}',
    #     'script_args': [
    #         '--partitions', '3',
    #         '--output_uri', f's3://{bucket_name}/{output_prefix}']
    # }

    # Sleep for 10 seconds to give the IAM roles and instance profile time to propagate.
    time.sleep(10)
    while True:
        try:
            self._cluster_id = self.run_job_flow_wrapper(
                f'{bucket_name}', False, ['Hadoop', 'Spark'], [self._step],
                job_flow_role, service_role, security_groups, emr_client)
            print(f"Running job flow for cluster {self._cluster_id}")
            break
        except ClientError as error:
            self._max_retry -= 1
            if self._max_retry > 0 and \
                    error.response['Error']['Code'] == 'ValidationException':
                print("Instance profile is not ready, wait for another 10 seconds")
                time.sleep(10)
            else:
                raise

    status_poller(
        "Waiting for cluster, this typically takes several minutes...",
        'RUNNING',
        lambda: emr_basics.describe_cluster(self._cluster_id, emr_client)['Status']['State'])

    status_poller(
        "Waiting for step to complete...",
        'PENDING',
        lambda: emr_basics.list_steps(self._cluster_id, emr_client)[0]['Status']['State'])

    status_poller(
        "Waiting for cluster to terminate.",
        'TERMINATED',
        lambda: emr_basics.describe_cluster(self._cluster_id, emr_client)['Status']['State'])

    print(f"Job complete! The script, logs, and output for this demo are in "
          f"Amazon S3 bucket {bucket_name}. The output is:")
    for obj in bucket.objects.filter(Prefix=output_prefix):
        print(obj.get()['Body'].read().decode())

    # Delete roles and security groups.
    # delete_security_groups(security_groups)
    # delete_roles([job_flow_role, service_role])
    # logger.info("Deleting security groups and roles")

    # To get the running time, retrieve the stdout of the EC2 instance from the logs,
    # named like: s3://{bucket_name}/logs/{self._cluster_name}/{cluster_id}/containers/
    #     application_1604063314445_0001/container_1604063314445_0001_01_000001/stdout.gz
    self.fetchResult()
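# A minimal sketch of what fetchResult might look like, based on the log-path
# comment above (an assumption; the real method is not shown in this listing):
# list the container logs for this cluster, find the driver's stdout.gz, and
# print its decompressed contents.
def fetchResult(self):
    import gzip

    s3_client = boto3.client('s3')
    log_prefix = f'logs/{self._cluster_name}/{self._cluster_id}/containers/'
    response = s3_client.list_objects_v2(Bucket='silhouette-rcv1', Prefix=log_prefix)
    for obj in response.get('Contents', []):
        if obj['Key'].endswith('stdout.gz'):
            body = s3_client.get_object(
                Bucket='silhouette-rcv1', Key=obj['Key'])['Body'].read()
            print(gzip.decompress(body).decode())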
def demo_short_lived_cluster(PREFIX, S3_BUCKET, S3_KEY, LOCAL_SCRIPT_KEY, REGION,
                             AWS_ACCESS_KEY, AWS_SECRET):
    """
    Create a short-lived cluster that runs a step and automatically terminates
    after the step completes.

    :param PREFIX: The prefix to use when creating the EMR cluster and security groups.
    :param S3_BUCKET: The name of the S3 bucket used.
    :param S3_KEY: The key where the PySpark script will be uploaded.
    :param LOCAL_SCRIPT_KEY: The local path of the PySpark script.
    :param REGION: The AWS Region in which to create the resources.
    :param AWS_ACCESS_KEY: The AWS access key ID used to create the clients.
    :param AWS_SECRET: The AWS secret access key used to create the clients.
    """
    print('-' * 88)
    print("Welcome to the Amazon EMR short-lived cluster.")
    print('-' * 88)

    # prefix = 'CSGO-PIPELINE-EMR-CLUSTER'
    prefix = PREFIX

    s3_resource = boto3.resource(
        's3', region_name=REGION, aws_access_key_id=AWS_ACCESS_KEY,
        aws_secret_access_key=AWS_SECRET)
    iam_resource = boto3.resource(
        'iam', region_name=REGION, aws_access_key_id=AWS_ACCESS_KEY,
        aws_secret_access_key=AWS_SECRET)
    emr_client = boto3.client(
        'emr', region_name=REGION, aws_access_key_id=AWS_ACCESS_KEY,
        aws_secret_access_key=AWS_SECRET)
    ec2_resource = boto3.resource(
        'ec2', region_name=REGION, aws_access_key_id=AWS_ACCESS_KEY,
        aws_secret_access_key=AWS_SECRET)

    S3_URI = 's3://{bucket}/{key}'.format(bucket=S3_BUCKET, key=S3_KEY)
    s3_resource.meta.client.upload_file(LOCAL_SCRIPT_KEY, S3_BUCKET, S3_KEY)

    # Set up resources.
    bucket_name = S3_BUCKET
    # Get the local time to make the role and security group names unique.
    named_tuple = time.localtime()
    time_string = time.strftime("%m.%d.%Y-%Hh%Mm%Ss", named_tuple)
    job_flow_role, service_role = create_roles(
        f'{time_string}-{prefix}-ec2-role',
        f'{time_string}-{prefix}-service-role', iam_resource)
    security_groups = create_security_groups(f'{time_string}-{prefix}', ec2_resource)

    # Run the job.
    step = {'name': 'pyspark_test', 'script_uri': S3_URI, 'script_args': []}

    print("Wait for 10 seconds to give roles and profiles time to propagate...")
    time.sleep(10)

    max_tries = 5
    while True:
        try:
            cluster_id = emr_basics.run_job_flow(
                f'{prefix}-cluster', f's3://{bucket_name}/logs',
                False, ['Hadoop', 'Hive', 'Spark'], job_flow_role, service_role,
                security_groups, [step], emr_client)
            print(f"Running job flow for cluster {cluster_id}...")
            break
        except ClientError as error:
            max_tries -= 1
            if max_tries > 0 and \
                    error.response['Error']['Code'] == 'ValidationException':
                print("Instance profile is not ready, let's give it more time...")
                time.sleep(10)
            else:
                raise

    status_poller(
        "Waiting for cluster, this typically takes several minutes...",
        'RUNNING',
        lambda: emr_basics.describe_cluster(cluster_id, emr_client)['Status']['State'])

    status_poller(
        "Waiting for step to complete...",
        'PENDING',
        lambda: emr_basics.list_steps(cluster_id, emr_client)[0]['Status']['State'])

    status_poller(
        "Waiting for cluster to terminate.",
        'TERMINATED',
        lambda: emr_basics.describe_cluster(cluster_id, emr_client)['Status']['State'])

    print(f"Job complete! The script, logs, and output are in "
          f"Amazon S3 bucket {bucket_name}/logs.")

    # steps = emr_basics.list_steps(cluster_id, emr_client)
    # for step in steps:
    #     print(emr_basics.describe_step(cluster_id, step, emr_client))

    delete_security_groups(security_groups)
    delete_roles([job_flow_role, service_role])


# if __name__ == '__main__':
#     logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
#     demo_short_lived_cluster(
#         "CS_GO_PIPELINE", "fpmacedo", "spark/pyspark_script.py", "pyspark_script.py")
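# The commented-out invocation above predates the REGION and credential
# parameters. A sketch of a full invocation (the Region and the
# environment-variable credentials are assumptions; never hard-code real keys):
if __name__ == '__main__':
    import logging
    import os

    logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
    demo_short_lived_cluster(
        PREFIX='CS_GO_PIPELINE',
        S3_BUCKET='fpmacedo',
        S3_KEY='spark/pyspark_script.py',
        LOCAL_SCRIPT_KEY='pyspark_script.py',
        REGION='us-east-1',
        AWS_ACCESS_KEY=os.environ['AWS_ACCESS_KEY_ID'],
        AWS_SECRET=os.environ['AWS_SECRET_ACCESS_KEY'])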