Example 1
def test_describe_cluster(make_stubber, error_code):
    emr_client = boto3.client('emr')
    emr_stubber = make_stubber(emr_client)
    cluster_id = 'test-id'
    cluster = {'Id': cluster_id, 'Name': 'cluster-name'}

    emr_stubber.stub_describe_cluster(cluster_id,
                                      cluster,
                                      error_code=error_code)

    if error_code is None:
        got_cluster = emr_basics.describe_cluster(cluster_id, emr_client)
        assert got_cluster == cluster
    else:
        with pytest.raises(ClientError) as exc_info:
            emr_basics.describe_cluster(cluster_id, emr_client)
        assert exc_info.value.response['Error']['Code'] == error_code
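The test above relies on imports and fixtures that the snippet does not show. A minimal sketch of the assumed scaffolding follows; the parametrize values and fixture behavior are assumptions, not part of the original example.

import boto3
import pytest
from botocore.exceptions import ClientError

import emr_basics

# error_code is typically injected by parametrization so the same test covers
# the success path and the stubbed-error path, for example:
# @pytest.mark.parametrize('error_code', [None, 'TestException'])
# make_stubber is assumed to be a fixture that wraps a botocore Stubber around
# the client and exposes stub_* helpers such as stub_describe_cluster.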
def demo_long_lived_cluster():
    """
    Shows how to create a long-lived cluster that waits after all steps are run so
    that more steps can be run. At the end of the demo, the cluster is optionally
    terminated.
    """
    print('-' * 88)
    print(f"Welcome to the Amazon EMR long-lived cluster demo.")
    print('-' * 88)

    prefix = 'aws-demo-long-emr'

    s3_resource = boto3.resource('s3')
    iam_resource = boto3.resource('iam')
    emr_client = boto3.client('emr')
    ec2_resource = boto3.resource('ec2')

    # Set up resources for the demo.
    bucket_name = f'{prefix}-{time.time_ns()}'
    script_file_name = 'pyspark_top_product_keyword.py'
    script_key = f'scripts/{script_file_name}'
    bucket = setup_bucket(bucket_name, script_file_name, script_key,
                          s3_resource)
    job_flow_role, service_role = \
        create_roles(f'{prefix}-ec2-role', f'{prefix}-service-role', iam_resource)
    security_groups = create_security_groups(prefix, ec2_resource)
    print(
        "Wait for 10 seconds to give roles and profiles time to propagate...")
    time.sleep(10)

    max_tries = 5
    while True:
        try:
            cluster_id = emr_basics.run_job_flow(
                f'{prefix}-cluster', f's3://{bucket_name}/logs', True,
                ['Hadoop', 'Hive', 'Spark'], job_flow_role, service_role,
                security_groups, [], emr_client)
            print(f"Running job flow for cluster {cluster_id}...")
            break
        except ClientError as error:
            max_tries -= 1
            if max_tries > 0 and \
                    error.response['Error']['Code'] == 'ValidationException':
                print(
                    "Instance profile is not ready, let's give it more time..."
                )
                time.sleep(10)
            else:
                raise
    status_poller(
        "Waiting for cluster, this typically takes several minutes...",
        'WAITING',
        lambda: emr_basics.describe_cluster(cluster_id, emr_client)['Status'][
            'State'],
    )

    add_top_product_step('20', 'Books', 'fire', cluster_id, bucket, script_key,
                         emr_client)

    add_top_product_step('20', 'Grocery', 'cheese', cluster_id, bucket,
                         script_key, emr_client)

    review_bucket_folders = s3_resource.meta.client.list_objects_v2(
        Bucket='amazon-reviews-pds',
        Prefix='parquet/',
        Delimiter='/',
        MaxKeys=100)
    categories = [
        cat['Prefix'].split('=')[1][:-1]
        for cat in review_bucket_folders['CommonPrefixes']
    ]
    while True:
        while True:
            input_cat = input(
                f"Your turn! Possible categories are: {categories}. Which category "
                f"would you like to search (enter 'none' when you're done)? ")
            if input_cat.lower() == 'none' or input_cat in categories:
                break
            else:
                print(f"Sorry, {input_cat} is not an allowed category!")
        if input_cat.lower() == 'none':
            break
        else:
            input_keyword = input(
                "What keyword would you like to search for? ")
            input_count = input("How many items would you like to list? ")
            add_top_product_step(input_count, input_cat, input_keyword,
                                 cluster_id, bucket, script_key, emr_client)

    # Clean up demo resources (if you want to).
    remove_everything = input(
        f"Do you want to terminate the cluster and delete the security roles, "
        f"groups, bucket, and all of its contents (y/n)? ")
    if remove_everything.lower() == 'y':
        emr_basics.terminate_cluster(cluster_id, emr_client)
        status_poller(
            "Waiting for cluster to terminate.",
            'TERMINATED', lambda: emr_basics.describe_cluster(
                cluster_id, emr_client)['Status']['State'])
        delete_security_groups(security_groups)
        delete_roles([job_flow_role, service_role])
        delete_bucket(bucket)
    else:
        print(
            f"Remember that running Amazon EMR clusters and objects kept in an "
            f"Amazon S3 bucket can incur charges against your account.")
    print("Thanks for watching!")
def demo_short_lived_cluster():
    """
    Shows how to create a short-lived cluster that runs a step and automatically
    terminates after the step completes.
    """
    print('-' * 88)
    print(f"Welcome to the Amazon EMR short-lived cluster demo.")
    print('-' * 88)

    prefix = f'aws-demo-short-emr'

    s3_resource = boto3.resource('s3')
    iam_resource = boto3.resource('iam')
    emr_client = boto3.client('emr')
    ec2_resource = boto3.resource('ec2')

    # Set up resources for the demo.
    bucket_name = f'{prefix}-{time.time_ns()}'
    script_file_name = 'pyspark_estimate_pi.py'
    script_key = f'scripts/{script_file_name}'
    bucket = setup_bucket(bucket_name, script_file_name, script_key,
                          s3_resource)
    job_flow_role, service_role = create_roles(f'{prefix}-ec2-role',
                                               f'{prefix}-service-role',
                                               iam_resource)
    security_groups = create_security_groups(prefix, ec2_resource)

    # Run the job.
    output_prefix = 'pi-calc-output'
    pi_step = {
        'name': 'estimate-pi-step',
        'script_uri': f's3://{bucket_name}/{script_key}',
        'script_args': [
            '--partitions', '3',
            '--output_uri', f's3://{bucket_name}/{output_prefix}'],
    }
    print(
        "Wait for 10 seconds to give roles and profiles time to propagate...")
    time.sleep(10)
    max_tries = 5
    while True:
        try:
            cluster_id = emr_basics.run_job_flow(
                f'{prefix}-cluster', f's3://{bucket_name}/logs', False,
                ['Hadoop', 'Hive', 'Spark'], job_flow_role, service_role,
                security_groups, [pi_step], emr_client)
            print(f"Running job flow for cluster {cluster_id}...")
            break
        except ClientError as error:
            max_tries -= 1
            if max_tries > 0 and \
                    error.response['Error']['Code'] == 'ValidationException':
                print(
                    "Instance profile is not ready, let's give it more time..."
                )
                time.sleep(10)
            else:
                raise

    status_poller(
        "Waiting for cluster, this typically takes several minutes...",
        'RUNNING',
        lambda: emr_basics.describe_cluster(cluster_id, emr_client)['Status'][
            'State'],
    )
    status_poller(
        "Waiting for step to complete...", 'PENDING',
        lambda: emr_basics.list_steps(cluster_id, emr_client)[0]['Status']['State'])
    status_poller(
        "Waiting for cluster to terminate.",
        'TERMINATED', lambda: emr_basics.describe_cluster(
            cluster_id, emr_client)['Status']['State'])

    print(f"Job complete!. The script, logs, and output for this demo are in "
          f"Amazon S3 bucket {bucket_name}. The output is:")
    for obj in bucket.objects.filter(Prefix=output_prefix):
        print(obj.get()['Body'].read().decode())

    # Clean up demo resources (if you want to).
    remove_everything = input(
        f"Do you want to delete the security roles, groups, and bucket (y/n)? "
    )
    if remove_everything.lower() == 'y':
        delete_security_groups(security_groups)
        delete_roles([job_flow_role, service_role])
        delete_bucket(bucket)
    else:
        print(
            "Remember that objects kept in an Amazon S3 bucket can incur charges "
            "against your account.")
    print("Thanks for watching!")
    def run(self):
        # Start the new job flow; the cluster name and log URI are generated from self.
        s3_resource = boto3.resource('s3')
        iam_resource = boto3.resource('iam')
        emr_client = boto3.client('emr')
        ec2_resource = boto3.resource('ec2')
        # Set up resources for the cluster.
        # Use a fixed S3 bucket rather than creating a new one.
        bucket_name = "silhouette-rcv1"
        bucket = s3_resource.Bucket(bucket_name)
        job_flow_role, service_role = create_roles(
            f'{self._cluster_name[0:50]}-ec2-role', f'{self._cluster_name[0:50]}-service-role', iam_resource
        )
        security_groups = create_security_groups(self._cluster_name, ec2_resource)
        script_key = ""
        output_prefix = f'{self._cluster_name}'
        #  this_step = {  # TODO fill the algo config
        #      'name': 'kmeans',  # TODO: here is the algo name
        #      'script_uri': f's3://{bucket_name}/{script_key}',
        #      'script_args':
        #          [
        #              '--partitions', '3', '--output_uri', f's3://{bucket_name}/{output_prefix}'
        #          ]
        #  }
        time.sleep(10)  # Give the IAM roles and instance profile time to propagate.
        while True:
            try:
                self._cluster_id = self.run_job_flow_wrapper(
                    f'{bucket_name}',
                    False, ['Hadoop', 'Spark'], [self._step],
                    job_flow_role, service_role, security_groups, emr_client)
                print(f"Running job flow for cluster {self._cluster_id}")
                break
            except ClientError as error:
                self._max_retry -= 1
                if self._max_retry > 0 and \
                        error.response['Error']['Code'] == 'ValidationException':
                    print("Instance profile is not ready, wait for another 10 seconds")
                    time.sleep(10)
                else:
                    raise

        status_poller(
            "Waiting for cluster, this typically takes several minutes...",
            'RUNNING',
            lambda: emr_basics.describe_cluster(self._cluster_id, emr_client)['Status']['State'],
        )
        status_poller(
            "Waiting for step to complete...",
            'PENDING',
            lambda: emr_basics.list_steps(self._cluster_id, emr_client)[0]['Status']['State'])
        status_poller(
            "Waiting for cluster to terminate.",
            'TERMINATED',
            lambda: emr_basics.describe_cluster(self._cluster_id, emr_client)['Status']['State']
        )
        print(f"Job complete!. The script, logs, and output for this demo are in "
              f"Amazon S3 bucket {bucket_name}. The output is:")
        for obj in bucket.objects.filter(Prefix=output_prefix):
            print(obj.get()['Body'].read().decode())

        # Delete roles and security groups.
        # delete_security_groups(security_groups)
        # delete_roles([job_flow_role, service_role])
        # logger.info("Deleting security groups and roles")

        # Get the time of running, we choose to retrieve the stdout of ec2 instance from logs
        # naming: s3://{bucket_name}/logs/{self._cluster_name}/{cluster_id}/containers/application_160410
        #  application_1604063314445_0001/container_1604063314445_0001_01_000001/stdout.gz
        self.fetchResult()
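The run method above comes from a class whose definition is not part of the snippet. A hypothetical skeleton of the attributes and helpers it relies on (the class name and defaults are assumptions):

class ClusterJob:
    def __init__(self, cluster_name, step, max_retry=5):
        self._cluster_name = cluster_name  # used to name roles and security groups
        self._step = step                  # step dict passed to the job flow
        self._max_retry = max_retry        # retries while the instance profile propagates
        self._cluster_id = None            # set once the job flow starts

    def run_job_flow_wrapper(self, name, keep_alive, applications, steps,
                             job_flow_role, service_role, security_groups,
                             emr_client):
        # Thin wrapper around the EMR RunJobFlow call; signature inferred from
        # the call in run() above.
        raise NotImplementedError

    def fetchResult(self):
        # Retrieves the step's stdout.gz from the container logs in S3 (see the
        # commented path pattern in run()); implementation not shown.
        raise NotImplementedError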
Example 5
def demo_short_lived_cluster(PREFIX, S3_BUCKET, S3_KEY, LOCAL_SCRIPT_KEY,
                             REGION, AWS_ACCESS_KEY, AWS_SECRET):
    """
    Create a short-lived cluster that runs a step and automatically
    terminates after the step completes.

    :param PREFIX: The prefix to use in the EMR cluster and security groups creation.
    :param S3_BUCKET: The name of the S3 bucket to use.
    :param S3_KEY: The key where the PySpark script will be uploaded.
    :param LOCAL_SCRIPT_KEY: The local path of the PySpark script.
    :param REGION: The AWS Region where the resources are created.
    :param AWS_ACCESS_KEY: The AWS access key ID used for authentication.
    :param AWS_SECRET: The AWS secret access key used for authentication.
    """
    print('-' * 88)
    print(f"Welcome to the Amazon EMR short-lived cluster.")
    print('-' * 88)

    #prefix = f'CSGO-PIPELINE-EMR-CLUSTER'
    prefix = PREFIX

    s3_resource = boto3.resource('s3',
                                 region_name=REGION,
                                 aws_access_key_id=AWS_ACCESS_KEY,
                                 aws_secret_access_key=AWS_SECRET)

    iam_resource = boto3.resource('iam',
                                  region_name=REGION,
                                  aws_access_key_id=AWS_ACCESS_KEY,
                                  aws_secret_access_key=AWS_SECRET)

    emr_client = boto3.client('emr',
                              region_name=REGION,
                              aws_access_key_id=AWS_ACCESS_KEY,
                              aws_secret_access_key=AWS_SECRET)

    ec2_resource = boto3.resource('ec2',
                                  region_name=REGION,
                                  aws_access_key_id=AWS_ACCESS_KEY,
                                  aws_secret_access_key=AWS_SECRET)

    S3_URI = 's3://{bucket}/{key}'.format(bucket=S3_BUCKET, key=S3_KEY)

    s3_resource.meta.client.upload_file(LOCAL_SCRIPT_KEY, S3_BUCKET, S3_KEY)

    # Set up resources.
    bucket_name = S3_BUCKET

    # Get the local time to make the role and security group names unique.
    named_tuple = time.localtime()
    time_string = time.strftime("%m.%d.%Y-%Hh%Mm%Ss", named_tuple)

    job_flow_role, service_role = create_roles(
        f'{time_string}-{prefix}-ec2-role',
        f'{time_string}-{prefix}-service-role', iam_resource)

    security_groups = create_security_groups(f'{time_string}-{prefix}',
                                             ec2_resource)

    # Run the job.
    step = {'name': 'pyspark_test', 'script_uri': S3_URI, 'script_args': []}
    print(
        "Wait for 10 seconds to give roles and profiles time to propagate...")
    time.sleep(10)
    max_tries = 5
    while True:
        try:
            cluster_id = emr_basics.run_job_flow(
                f'{prefix}-cluster', f's3://{bucket_name}/logs', False,
                ['Hadoop', 'Hive', 'Spark'], job_flow_role, service_role,
                security_groups, [step], emr_client)
            print(f"Running job flow for cluster {cluster_id}...")
            break
        except ClientError as error:
            max_tries -= 1
            if max_tries > 0 and \
                    error.response['Error']['Code'] == 'ValidationException':
                print(
                    "Instance profile is not ready, let's give it more time..."
                )
                time.sleep(10)
            else:
                raise

    status_poller(
        "Waiting for cluster, this typically takes several minutes...",
        'RUNNING',
        lambda: emr_basics.describe_cluster(cluster_id, emr_client)['Status'][
            'State'],
    )
    status_poller(
        "Waiting for step to complete...", 'PENDING',
        lambda: emr_basics.list_steps(cluster_id, emr_client)[0]['Status']['State'])
    status_poller(
        "Waiting for cluster to terminate.",
        'TERMINATED', lambda: emr_basics.describe_cluster(
            cluster_id, emr_client)['Status']['State'])

    print(f"Job complete!. The script, logs, and output are in "
          f"Amazon S3 bucket {bucket_name}/logs.")

    #steps = emr_basics.list_steps(cluster_id, emr_client)

    #for step in steps:
    #   print(emr_basics.describe_step(cluster_id, step, emr_client))

    delete_security_groups(security_groups)
    delete_roles([job_flow_role, service_role])


#if __name__ == '__main__':
#    logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
#    demo_short_lived_cluster("CS_GO_PIPELINE", "fpmacedo", "spark/pyspark_script.py", "pyspark_script.py")
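The commented-out invocation above passes only four arguments, while the function takes seven. A call matching the full signature might look like this; the Region and credential values are placeholders:

import logging

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
    demo_short_lived_cluster(
        PREFIX='CS_GO_PIPELINE',
        S3_BUCKET='fpmacedo',
        S3_KEY='spark/pyspark_script.py',
        LOCAL_SCRIPT_KEY='pyspark_script.py',
        REGION='us-east-1',                    # placeholder Region
        AWS_ACCESS_KEY='YOUR_ACCESS_KEY_ID',   # placeholder credentials
        AWS_SECRET='YOUR_SECRET_ACCESS_KEY')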