# error_code is supplied by test parametrization; the error string below is illustrative.
@pytest.mark.parametrize('error_code', [None, 'TestException'])
def test_run_job_flow(make_stubber, make_unique_name, error_code):
    emr_client = boto3.client('emr')
    emr_stubber = make_stubber(emr_client)
    cluster_name = make_unique_name('cluster-')
    log_uri = 's3://test-bucket'
    release = 'emr-5.30.1'
    instance_type = 'm5.xlarge'
    instance_count = 3
    keep_alive = True
    steps = [{
        'name': make_unique_name('step-'),
        'script_uri': 's3://test-bucket',
        'script_args': ('--testing',)}]
    applications = ['test-app']
    cluster_id = 'i-123456789'
    job_flow_role = MagicMock()
    job_flow_role.name = 'job-flow-role'
    service_role = MagicMock()
    service_role.name = 'service_role'
    security_groups = {
        'manager': MagicMock(id='sg-1234'),
        'worker': MagicMock(id='sg-5678')}

    emr_stubber.stub_run_job_flow(
        cluster_name, log_uri, release, instance_type, instance_count,
        keep_alive, steps, applications, job_flow_role.name, service_role.name,
        security_groups, cluster_id, error_code=error_code)

    if error_code is None:
        got_id = emr_basics.run_job_flow(
            cluster_name, log_uri, keep_alive, applications, job_flow_role,
            service_role, security_groups, steps, emr_client)
        assert got_id == cluster_id
    else:
        with pytest.raises(ClientError) as exc_info:
            emr_basics.run_job_flow(
                cluster_name, log_uri, keep_alive, applications, job_flow_role,
                service_role, security_groups, steps, emr_client)
        assert exc_info.value.response['Error']['Code'] == error_code
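# The test above uses shared pytest fixtures (`make_stubber`, `make_unique_name`)
# whose implementations are not shown in this section. A minimal sketch of a
# `make_unique_name` fixture, assuming it only needs to return reasonably unique
# names, could look like this (illustrative only, not the repository's version):
import uuid

@pytest.fixture
def make_unique_name():
    def _make_unique_name(prefix):
        # Append a random UUID so repeated test runs never collide on names.
        return f"{prefix}{uuid.uuid4()}"
    return _make_unique_name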
def demo_long_lived_cluster():
    """
    Shows how to create a long-lived cluster that waits after all steps are run
    so that more steps can be run. At the end of the demo, the cluster is
    optionally terminated.
    """
    print('-' * 88)
    print("Welcome to the Amazon EMR long-lived cluster demo.")
    print('-' * 88)

    prefix = 'aws-demo-long-emr'

    s3_resource = boto3.resource('s3')
    iam_resource = boto3.resource('iam')
    emr_client = boto3.client('emr')
    ec2_resource = boto3.resource('ec2')

    # Set up resources for the demo.
    bucket_name = f'{prefix}-{time.time_ns()}'
    script_file_name = 'pyspark_top_product_keyword.py'
    script_key = f'scripts/{script_file_name}'
    bucket = setup_bucket(bucket_name, script_file_name, script_key, s3_resource)
    job_flow_role, service_role = create_roles(
        f'{prefix}-ec2-role', f'{prefix}-service-role', iam_resource)
    security_groups = create_security_groups(prefix, ec2_resource)

    print("Wait for 10 seconds to give roles and profiles time to propagate...")
    time.sleep(10)

    max_tries = 5
    while True:
        try:
            cluster_id = emr_basics.run_job_flow(
                f'{prefix}-cluster', f's3://{bucket_name}/logs', True,
                ['Hadoop', 'Hive', 'Spark'], job_flow_role, service_role,
                security_groups, [], emr_client)
            print(f"Running job flow for cluster {cluster_id}...")
            break
        except ClientError as error:
            max_tries -= 1
            if max_tries > 0 and \
                    error.response['Error']['Code'] == 'ValidationException':
                print("Instance profile is not ready, let's give it more time...")
                time.sleep(10)
            else:
                raise

    status_poller(
        "Waiting for cluster, this typically takes several minutes...",
        'WAITING',
        lambda: emr_basics.describe_cluster(cluster_id, emr_client)['Status']['State'])

    add_top_product_step(
        '20', 'Books', 'fire', cluster_id, bucket, script_key, emr_client)
    add_top_product_step(
        '20', 'Grocery', 'cheese', cluster_id, bucket, script_key, emr_client)

    review_bucket_folders = s3_resource.meta.client.list_objects_v2(
        Bucket='amazon-reviews-pds', Prefix='parquet/', Delimiter='/', MaxKeys=100)
    categories = [
        cat['Prefix'].split('=')[1][:-1]
        for cat in review_bucket_folders['CommonPrefixes']]

    while True:
        while True:
            input_cat = input(
                f"Your turn! Possible categories are: {categories}. Which category "
                f"would you like to search (enter 'none' when you're done)? ")
            if input_cat.lower() == 'none' or input_cat in categories:
                break
            elif input_cat not in categories:
                print(f"Sorry, {input_cat} is not an allowed category!")
        if input_cat.lower() == 'none':
            break
        else:
            input_keyword = input("What keyword would you like to search for? ")
            input_count = input("How many items would you like to list? ")
            add_top_product_step(
                input_count, input_cat, input_keyword, cluster_id, bucket,
                script_key, emr_client)

    # Clean up demo resources (if you want to).
    remove_everything = input(
        "Do you want to terminate the cluster and delete the security roles, "
        "groups, bucket, and all of its contents (y/n)? ")
    if remove_everything.lower() == 'y':
        emr_basics.terminate_cluster(cluster_id, emr_client)
        status_poller(
            "Waiting for cluster to terminate.",
            'TERMINATED',
            lambda: emr_basics.describe_cluster(cluster_id, emr_client)['Status']['State'])
        delete_security_groups(security_groups)
        delete_roles([job_flow_role, service_role])
        delete_bucket(bucket)
    else:
        print(
            "Remember that running Amazon EMR clusters and objects kept in an "
            "Amazon S3 bucket can incur charges against your account.")

    print("Thanks for watching!")
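# The demos in this section assume a `status_poller` helper that repeatedly calls a
# status function until a target state is reached. Its real implementation is not
# shown here; a minimal sketch, assuming only that `func` returns the current status
# string, could look like this:
def status_poller(intro, done_status, func):
    """
    Polls `func` until it returns `done_status`, printing progress along the way.
    Illustrative sketch only, not the original helper.
    """
    print(intro)
    status = None
    while status != done_status:
        status = func()
        print(f"Current status: {status}")
        if status != done_status:
            # Wait between polls to avoid hammering the EMR API.
            time.sleep(10)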
def demo_short_lived_cluster():
    """
    Shows how to create a short-lived cluster that runs a step and automatically
    terminates after the step completes.
    """
    print('-' * 88)
    print("Welcome to the Amazon EMR short-lived cluster demo.")
    print('-' * 88)

    prefix = 'aws-demo-short-emr'

    s3_resource = boto3.resource('s3')
    iam_resource = boto3.resource('iam')
    emr_client = boto3.client('emr')
    ec2_resource = boto3.resource('ec2')

    # Set up resources for the demo.
    bucket_name = f'{prefix}-{time.time_ns()}'
    script_file_name = 'pyspark_estimate_pi.py'
    script_key = f'scripts/{script_file_name}'
    bucket = setup_bucket(bucket_name, script_file_name, script_key, s3_resource)
    job_flow_role, service_role = create_roles(
        f'{prefix}-ec2-role', f'{prefix}-service-role', iam_resource)
    security_groups = create_security_groups(prefix, ec2_resource)

    # Run the job.
    output_prefix = 'pi-calc-output'
    pi_step = {
        'name': 'estimate-pi-step',
        'script_uri': f's3://{bucket_name}/{script_key}',
        'script_args': [
            '--partitions', '3',
            '--output_uri', f's3://{bucket_name}/{output_prefix}']}

    print("Wait for 10 seconds to give roles and profiles time to propagate...")
    time.sleep(10)

    max_tries = 5
    while True:
        try:
            cluster_id = emr_basics.run_job_flow(
                f'{prefix}-cluster', f's3://{bucket_name}/logs', False,
                ['Hadoop', 'Hive', 'Spark'], job_flow_role, service_role,
                security_groups, [pi_step], emr_client)
            print(f"Running job flow for cluster {cluster_id}...")
            break
        except ClientError as error:
            max_tries -= 1
            if max_tries > 0 and \
                    error.response['Error']['Code'] == 'ValidationException':
                print("Instance profile is not ready, let's give it more time...")
                time.sleep(10)
            else:
                raise

    status_poller(
        "Waiting for cluster, this typically takes several minutes...",
        'RUNNING',
        lambda: emr_basics.describe_cluster(cluster_id, emr_client)['Status']['State'])

    status_poller(
        "Waiting for step to complete...",
        'PENDING',
        lambda: emr_basics.list_steps(cluster_id, emr_client)[0]['Status']['State'])

    status_poller(
        "Waiting for cluster to terminate.",
        'TERMINATED',
        lambda: emr_basics.describe_cluster(cluster_id, emr_client)['Status']['State'])

    print(f"Job complete! The script, logs, and output for this demo are in "
          f"Amazon S3 bucket {bucket_name}. The output is:")
    for obj in bucket.objects.filter(Prefix=output_prefix):
        print(obj.get()['Body'].read().decode())

    # Clean up demo resources (if you want to).
    remove_everything = input(
        "Do you want to delete the security roles, groups, and bucket (y/n)? ")
    if remove_everything.lower() == 'y':
        delete_security_groups(security_groups)
        delete_roles([job_flow_role, service_role])
        delete_bucket(bucket)
    else:
        print(
            "Remember that objects kept in an Amazon S3 bucket can incur charges "
            "against your account.")

    print("Thanks for watching!")
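# Both demos rely on a `setup_bucket` helper that creates the demo bucket and uploads
# the PySpark script to it. Its real implementation is not shown in this section; a
# minimal sketch, assuming the script file sits next to this module and that the
# default Region's bucket-creation rules apply, might be:
def setup_bucket(bucket_name, script_file_name, script_key, s3_resource):
    """
    Creates an S3 bucket and uploads the job script to it. Illustrative sketch only;
    the original helper also handles Region configuration and error reporting.
    """
    bucket = s3_resource.create_bucket(Bucket=bucket_name)
    bucket.wait_until_exists()
    # Upload the local PySpark script so EMR steps can reference it by S3 URI.
    bucket.upload_file(script_file_name, script_key)
    return bucket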
def demo_short_lived_cluster(PREFIX, S3_BUCKET, S3_KEY, LOCAL_SCRIPT_KEY, REGION,
                             AWS_ACCESS_KEY, AWS_SECRET):
    """
    Create a short-lived cluster that runs a step and automatically terminates
    after the step completes.

    :param PREFIX: The prefix to use when creating the EMR cluster and security groups.
    :param S3_BUCKET: The name of the S3 bucket used.
    :param S3_KEY: The key where the PySpark script will be uploaded.
    :param LOCAL_SCRIPT_KEY: The local path of the PySpark script.
    :param REGION: The AWS Region where the resources are created.
    :param AWS_ACCESS_KEY: The AWS access key ID used for authentication.
    :param AWS_SECRET: The AWS secret access key used for authentication.
    """
    print('-' * 88)
    print("Welcome to the Amazon EMR short-lived cluster.")
    print('-' * 88)

    # prefix = 'CSGO-PIPELINE-EMR-CLUSTER'
    prefix = PREFIX

    s3_resource = boto3.resource(
        's3', region_name=REGION, aws_access_key_id=AWS_ACCESS_KEY,
        aws_secret_access_key=AWS_SECRET)
    iam_resource = boto3.resource(
        'iam', region_name=REGION, aws_access_key_id=AWS_ACCESS_KEY,
        aws_secret_access_key=AWS_SECRET)
    emr_client = boto3.client(
        'emr', region_name=REGION, aws_access_key_id=AWS_ACCESS_KEY,
        aws_secret_access_key=AWS_SECRET)
    ec2_resource = boto3.resource(
        'ec2', region_name=REGION, aws_access_key_id=AWS_ACCESS_KEY,
        aws_secret_access_key=AWS_SECRET)

    S3_URI = f's3://{S3_BUCKET}/{S3_KEY}'
    s3_resource.meta.client.upload_file(LOCAL_SCRIPT_KEY, S3_BUCKET, S3_KEY)

    # Set up resources.
    bucket_name = S3_BUCKET

    # Get the local time to make role and security group names unique.
    named_tuple = time.localtime()
    time_string = time.strftime("%m.%d.%Y-%Hh%Mm%Ss", named_tuple)

    job_flow_role, service_role = create_roles(
        f'{time_string}-{prefix}-ec2-role', f'{time_string}-{prefix}-service-role',
        iam_resource)
    security_groups = create_security_groups(f'{time_string}-{prefix}', ec2_resource)

    # Run the job.
    step = {'name': 'pyspark_test', 'script_uri': S3_URI, 'script_args': []}

    print("Wait for 10 seconds to give roles and profiles time to propagate...")
    time.sleep(10)

    max_tries = 5
    while True:
        try:
            cluster_id = emr_basics.run_job_flow(
                f'{prefix}-cluster', f's3://{bucket_name}/logs', False,
                ['Hadoop', 'Hive', 'Spark'], job_flow_role, service_role,
                security_groups, [step], emr_client)
            print(f"Running job flow for cluster {cluster_id}...")
            break
        except ClientError as error:
            max_tries -= 1
            if max_tries > 0 and \
                    error.response['Error']['Code'] == 'ValidationException':
                print("Instance profile is not ready, let's give it more time...")
                time.sleep(10)
            else:
                raise

    status_poller(
        "Waiting for cluster, this typically takes several minutes...",
        'RUNNING',
        lambda: emr_basics.describe_cluster(cluster_id, emr_client)['Status']['State'])

    status_poller(
        "Waiting for step to complete...",
        'PENDING',
        lambda: emr_basics.list_steps(cluster_id, emr_client)[0]['Status']['State'])

    status_poller(
        "Waiting for cluster to terminate.",
        'TERMINATED',
        lambda: emr_basics.describe_cluster(cluster_id, emr_client)['Status']['State'])

    print(f"Job complete! The script, logs, and output are in "
          f"Amazon S3 bucket {bucket_name}/logs.")

    # steps = emr_basics.list_steps(cluster_id, emr_client)
    # for step in steps:
    #     print(emr_basics.describe_step(cluster_id, step, emr_client))

    delete_security_groups(security_groups)
    delete_roles([job_flow_role, service_role])


# if __name__ == '__main__':
#     logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
#     demo_short_lived_cluster("CS_GO_PIPELINE", "fpmacedo", "spark/pyspark_script.py", "pyspark_script.py")
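# A sketch of how the parameterized demo above might be invoked, based on the
# commented-out entry point. All values below are placeholders; substitute your own
# prefix, bucket, script paths, Region, and credentials (or drop the key arguments in
# your own variant and rely on the default credential chain).
import logging

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
    demo_short_lived_cluster(
        PREFIX='my-demo',                       # placeholder prefix
        S3_BUCKET='my-demo-bucket',             # placeholder bucket name
        S3_KEY='spark/pyspark_script.py',       # where the script is uploaded in S3
        LOCAL_SCRIPT_KEY='pyspark_script.py',   # local path of the script
        REGION='us-east-1',                     # placeholder Region
        AWS_ACCESS_KEY='YOUR_ACCESS_KEY_ID',    # placeholder credential
        AWS_SECRET='YOUR_SECRET_ACCESS_KEY')    # placeholder credential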