def main():
    args_dict = parse_cli_args(create_parser())
    print("Args: ", args_dict)

    numeric_level = getattr(logging, args_dict['log_level'], None)
    logging.basicConfig(format=LOGFORMAT)
    logger.setLevel(numeric_level)

    client = boto3.client('emr', region_name=args_dict['aws_region'])
    s3 = boto3.resource('s3')

    cluster_id = args_dict.get('cluster_id')
    if cluster_id is None:
        logger.info("Launching cluster...")
        ec2_client = boto3.client('ec2', region_name=args_dict['aws_region'])
        pricing_client = boto3.client('pricing', region_name=args_dict['aws_region'])
        args_dict = determine_prices(args_dict, ec2_client, pricing_client)
        cluster_config = cluster.emr_config(**args_dict)
        response = client.run_job_flow(**cluster_config)
        cluster_id = response['JobFlowId']
        logger.info("Cluster ID: %s", cluster_id)

    emr_steps = steps.setup_steps(s3,
                                  args_dict['s3_bucket'],
                                  args_dict['app'],
                                  args_dict['submit_args'],
                                  args_dict['app_args'],
                                  args_dict['uploads'],
                                  args_dict['s3_dist_cp'])
    response = client.add_job_flow_steps(JobFlowId=cluster_id, Steps=emr_steps)

    try:
        step_ids = json.dumps(response['StepIds'])
    except KeyError:
        step_ids = 'Invalid response'
    logger.info("Step IDs: %s", step_ids)
def main():
    parser = create_parser()
    args = parser.parse_args()
    print("Args: ", args)

    numeric_level = getattr(logging, args.log_level, None)
    logging.basicConfig(format=LOGFORMAT)
    logger.setLevel(numeric_level)

    client = boto3.client('emr', region_name=args.aws_region)
    s3 = boto3.resource('s3')

    cluster_id = args.cluster_id
    if cluster_id is None:
        logger.info("Launching cluster...")
        args_dict = vars(args)
        if args.dynamic_pricing:
            ec2 = boto3.client('ec2', region_name=args.aws_region)
            bid_px, is_spot = pricing.get_bid_price(ec2, args.slave)
            args_dict['bid_price'] = str(bid_px)
            # Bid prices are floats, so format with %s rather than %d,
            # which would silently truncate the value.
            if is_spot:
                logger.info("Using spot pricing with bid price $%s", bid_px)
            else:
                logger.info("Spot price too high. Using on-demand price $%s", bid_px)
        cluster_config = cluster.emr_config(**args_dict)
        response = client.run_job_flow(**cluster_config)
        cluster_id = response['JobFlowId']
        logger.info("Cluster ID: %s", cluster_id)

    emr_steps = steps.setup_steps(s3,
                                  args.s3_bucket,
                                  args.app,
                                  args.submit_args,
                                  args.app_args,
                                  args.uploads,
                                  args.s3_dist_cp)
    client.add_job_flow_steps(JobFlowId=cluster_id, Steps=emr_steps)
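
# The sketch below shows one plausible shape for pricing.get_bid_price, whose
# (bid_price, is_spot) contract main() relies on above. It is NOT the
# project's actual implementation: the markup factor, the fallback on-demand
# price, and the helper name are all illustrative assumptions. It uses only
# the real boto3 EC2 API call describe_spot_price_history.
def get_bid_price_sketch(ec2_client, instance_type,
                         on_demand_price=0.4, spot_markup=1.2):
    """Return (bid_price, is_spot) for the given instance type (sketch)."""
    import datetime

    response = ec2_client.describe_spot_price_history(
        InstanceTypes=[instance_type],
        ProductDescriptions=['Linux/UNIX'],
        StartTime=datetime.datetime.utcnow(),
        MaxResults=20,
    )
    history = response['SpotPriceHistory']
    if not history:
        # No spot market data available: fall back to on-demand.
        return on_demand_price, False
    lowest_recent_spot = min(float(item['SpotPrice']) for item in history)
    bid = round(lowest_recent_spot * spot_markup, 3)
    # If even the marked-up bid meets or exceeds on-demand, spot is not worth it.
    if bid >= on_demand_price:
        return on_demand_price, False
    return bid, True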
def test_emr_cluster_config_with_defaults():
    config = emr_config('emr-5.2.0',
                        instance_type_master='m4.large',
                        keep_alive=False,
                        instance_type_core='m4.2xlarge',
                        instance_type_task='m4.2xlarge',
                        num_core=1,
                        num_task=1,
                        bid_price_task='0.1',
                        name="Test SparkSteps",
                        defaults=['spark-defaults', 'spark.speculation=false',
                                  'yarn-site', 'yarn.nodemanager.vmem-check-enabled=true'])
    print(config['Configurations'])
    assert config == {
        'Instances': {
            'InstanceGroups': [
                {'InstanceCount': 1,
                 'InstanceRole': 'MASTER',
                 'InstanceType': 'm4.large',
                 'Market': 'ON_DEMAND',
                 'Name': 'Master Node'},
                {'InstanceCount': 1,
                 'InstanceRole': 'CORE',
                 'InstanceType': 'm4.2xlarge',
                 'Market': 'ON_DEMAND',
                 'Name': 'Core Nodes'},
                {'BidPrice': '0.1',
                 'InstanceCount': 1,
                 'InstanceRole': 'TASK',
                 'InstanceType': 'm4.2xlarge',
                 'Market': 'SPOT',
                 'Name': 'Task Nodes'}
            ],
            'KeepJobFlowAliveWhenNoSteps': False,
            'TerminationProtected': False
        },
        'Applications': [{'Name': 'Hadoop'}, {'Name': 'Spark'}],
        'Configurations': [
            {'Classification': 'spark-defaults',
             'Properties': {'spark.speculation': 'false'}},
            {'Classification': 'yarn-site',
             'Properties': {'yarn.nodemanager.vmem-check-enabled': 'true'}}
        ],
        'Name': 'Test SparkSteps',
        'JobFlowRole': 'EMR_EC2_DefaultRole',
        'ReleaseLabel': 'emr-5.2.0',
        'VisibleToAllUsers': True,
        'ServiceRole': 'EMR_DefaultRole'
    }
    client = boto3.client('emr', region_name=AWS_REGION_NAME)
    client.run_job_flow(**config)
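
# Hypothetical sketch of how the flat defaults list above could be turned into
# EMR's Configurations structure: the list is read as alternating
# (classification, "key=value") pairs. The helper name is an assumption; it is
# shown only to make the mapping the test asserts explicit.
def parse_defaults_sketch(defaults):
    """Convert ['classification', 'key=value', ...] into Configurations."""
    configurations = []
    for classification, kv in zip(defaults[::2], defaults[1::2]):
        key, _, value = kv.partition('=')
        configurations.append({
            'Classification': classification,
            'Properties': {key: value},
        })
    return configurations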
def test_emr_cluster_config():
    config = emr_config('emr-5.2.0',
                        master='m4.large',
                        keep_alive=False,
                        slave='m4.2xlarge',
                        num_core=1,
                        num_task=1,
                        bid_price='0.1',
                        name="Test SparkSteps")
    assert config == {
        'Instances': {
            'InstanceGroups': [
                {'InstanceCount': 1,
                 'InstanceRole': 'MASTER',
                 'InstanceType': 'm4.large',
                 'Market': 'ON_DEMAND',
                 'Name': 'Master Node'},
                {'InstanceCount': 1,
                 'InstanceRole': 'CORE',
                 'InstanceType': 'm4.2xlarge',
                 'Market': 'ON_DEMAND',
                 'Name': 'Core Nodes'},
                {'BidPrice': '0.1',
                 'InstanceCount': 1,
                 'InstanceRole': 'TASK',
                 'InstanceType': 'm4.2xlarge',
                 'Market': 'SPOT',
                 'Name': 'Task Nodes'}
            ],
            'KeepJobFlowAliveWhenNoSteps': False,
            'TerminationProtected': False
        },
        'Applications': [{'Name': 'Hadoop'}, {'Name': 'Spark'}],
        'Name': 'Test SparkSteps',
        'JobFlowRole': 'EMR_EC2_DefaultRole',
        'ReleaseLabel': 'emr-5.2.0',
        'VisibleToAllUsers': True,
        'ServiceRole': 'EMR_DefaultRole'
    }
    client = boto3.client('emr', region_name=AWS_REGION_NAME)
    client.run_job_flow(**config)
def test_is_step_complete(emr_client, s3_client):
    """Ensure is_step_complete returns the expected boolean value."""
    cluster_config = emr_config('emr-5.2.0',
                                instance_type_master='m4.large',
                                jobflow_role='MyCustomRole',
                                keep_alive=False,
                                instance_type_core='m4.2xlarge',
                                instance_type_task='m4.2xlarge',
                                num_core=1,
                                num_task=1,
                                bid_price_task='0.1',
                                maximize_resource_allocation=True,
                                name='Test SparkSteps',
                                app_list=['hadoop', 'hive', 'spark'])
    response = emr_client.run_job_flow(**cluster_config)
    cluster_id = response['JobFlowId']

    test_step = {
        'Name': 'test-step',
        'ActionOnFailure': 'CANCEL_AND_WAIT',
        'HadoopJarStep': {
            'Jar': 'command-runner.jar',
            'Args': ['state-pusher-script']
        }
    }
    response = emr_client.add_job_flow_steps(JobFlowId=cluster_id,
                                             Steps=[test_step])
    last_step_id = response['StepIds'][-1]

    # While the step state is non-terminal, is_step_complete should return False.
    for state in ['PENDING', 'RUNNING', 'CONTINUE', 'CANCEL_PENDING']:
        set_step_state(last_step_id, cluster_id, state)
        assert not is_step_complete(emr_client, cluster_id, last_step_id), \
            'Expected last step to not be complete when step state is {}'.format(state)

    # When the last step is in the terminal COMPLETED state, is_step_complete
    # should return True.
    set_step_state(last_step_id, cluster_id, 'COMPLETED')
    assert is_step_complete(emr_client, cluster_id, last_step_id), \
        'Expected last step to be complete when last step state is COMPLETED'

    # When the last step is in a failed state, is_step_complete should raise a
    # helpful exception. The "no exception raised" assertion lives in the
    # `else` clause so that its AssertionError is not swallowed by the
    # `except Exception` handler.
    for state in ['CANCELLED', 'FAILED', 'INTERRUPTED']:
        set_step_state(last_step_id, cluster_id, state)
        try:
            is_step_complete(emr_client, cluster_id, last_step_id)
        except Exception as e:
            assert str(e) == 'EMR job failed', 'Exception message not as expected'
        else:
            assert False, ('Expected an exception to be raised '
                           'when the last step is in {} state'.format(state))
def main():
    args_dict = parse_cli_args(create_parser())
    print("Args: ", args_dict)

    numeric_level = getattr(logging, args_dict['log_level'], None)
    logging.basicConfig(format=LOGFORMAT)
    logging.getLogger('sparksteps').setLevel(numeric_level)

    client = boto3.client('emr', region_name=args_dict['aws_region'])
    s3 = boto3.resource('s3')

    cluster_id = args_dict.get('cluster_id')
    if cluster_id is None:
        logger.info("Launching cluster...")
        ec2_client = boto3.client('ec2', region_name=args_dict['aws_region'])
        pricing_client = boto3.client('pricing', region_name=args_dict['aws_region'])
        args_dict = determine_prices(args_dict, ec2_client, pricing_client)
        cluster_config = cluster.emr_config(**args_dict)
        response = client.run_job_flow(**cluster_config)
        cluster_id = response['JobFlowId']
        logger.info("Cluster ID: %s", cluster_id)

    emr_steps = steps.setup_steps(s3,
                                  args_dict['s3_bucket'],
                                  args_dict['s3_path'],
                                  args_dict['app'],
                                  args_dict['submit_args'],
                                  args_dict['app_args'],
                                  args_dict['uploads'],
                                  args_dict['s3_dist_cp'])
    response = client.add_job_flow_steps(JobFlowId=cluster_id, Steps=emr_steps)

    try:
        step_ids = json.dumps(response['StepIds'])
    except KeyError:
        step_ids = 'Invalid response'
        args_dict['wait'] = False  # without step IDs there is nothing to poll
    logger.info("Step IDs: %s", step_ids)

    sleep_interval = args_dict.get('wait')
    if sleep_interval:
        last_step_id = response['StepIds'][-1]
        logger.info('Polling until step {last_step} is complete '
                    'using a sleep interval of {interval} seconds...'
                    .format(last_step=last_step_id, interval=sleep_interval))
        wait_for_step_complete(client, cluster_id, last_step_id,
                               sleep_interval_s=int(sleep_interval))
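
# Hedged sketch of the polling pair main() calls above. These are NOT the
# package's actual implementations; they merely satisfy the contract that
# test_is_step_complete spells out (False while the step state is
# non-terminal, True on COMPLETED, an "EMR job failed" exception on failure
# states), using only the real boto3 EMR API call describe_step.
import time


def is_step_complete_sketch(client, cluster_id, step_id):
    """Return True once the step reaches COMPLETED; raise on failure states."""
    response = client.describe_step(ClusterId=cluster_id, StepId=step_id)
    state = response['Step']['Status']['State']
    if state in ('CANCELLED', 'FAILED', 'INTERRUPTED'):
        raise Exception('EMR job failed')
    return state == 'COMPLETED'


def wait_for_step_complete_sketch(client, cluster_id, step_id, sleep_interval_s):
    """Block until the step completes, polling every sleep_interval_s seconds."""
    while not is_step_complete_sketch(client, cluster_id, step_id):
        time.sleep(sleep_interval_s)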
def test_emr_cluster_config():
    config = emr_config('emr-5.2.0',
                        instance_type_master='m4.large',
                        jobflow_role='MyCustomRole',
                        service_role='MyServiceRole',
                        keep_alive=False,
                        instance_type_core='m4.2xlarge',
                        instance_type_task='m4.2xlarge',
                        num_core=1,
                        num_task=1,
                        bid_price_task='0.1',
                        maximize_resource_allocation=True,
                        name="Test SparkSteps",
                        app_list=['hadoop', 'hive', 'spark'])
    assert config == {
        'Instances': {
            'InstanceGroups': [
                {'InstanceCount': 1,
                 'InstanceRole': 'MASTER',
                 'InstanceType': 'm4.large',
                 'Market': 'ON_DEMAND',
                 'Name': 'Master Node'},
                {'InstanceCount': 1,
                 'InstanceRole': 'CORE',
                 'InstanceType': 'm4.2xlarge',
                 'Market': 'ON_DEMAND',
                 'Name': 'Core Nodes'},
                {'BidPrice': '0.1',
                 'InstanceCount': 1,
                 'InstanceRole': 'TASK',
                 'InstanceType': 'm4.2xlarge',
                 'Market': 'SPOT',
                 'Name': 'Task Nodes'}
            ],
            'KeepJobFlowAliveWhenNoSteps': False,
            'TerminationProtected': False
        },
        'Applications': [{'Name': 'Hadoop'}, {'Name': 'Hive'}, {'Name': 'Spark'}],
        'Name': 'Test SparkSteps',
        'JobFlowRole': 'MyCustomRole',
        'ServiceRole': 'MyServiceRole',
        'ReleaseLabel': 'emr-5.2.0',
        'VisibleToAllUsers': True,
        'Configurations': [{'Classification': 'spark',
                            'Properties': {'maximizeResourceAllocation': 'true'}}]
    }
    client = boto3.client('emr', region_name=AWS_REGION_NAME)
    client.run_job_flow(**config)
def test_emr_cluster_config():
    config = emr_config('emr-5.2.0',
                        instance_type_master='m4.large',
                        keep_alive=False,
                        instance_type_core='m4.2xlarge',
                        instance_type_task='m4.2xlarge',
                        num_core=1,
                        num_task=1,
                        bid_price_task='0.1',
                        maximize_resource_allocation=True,
                        name="Test SparkSteps")
    assert config == {
        'Instances': {
            'InstanceGroups': [
                {'InstanceCount': 1,
                 'InstanceRole': 'MASTER',
                 'InstanceType': 'm4.large',
                 'Market': 'ON_DEMAND',
                 'Name': 'Master Node'},
                {'InstanceCount': 1,
                 'InstanceRole': 'CORE',
                 'InstanceType': 'm4.2xlarge',
                 'Market': 'ON_DEMAND',
                 'Name': 'Core Nodes'},
                {'BidPrice': '0.1',
                 'InstanceCount': 1,
                 'InstanceRole': 'TASK',
                 'InstanceType': 'm4.2xlarge',
                 'Market': 'SPOT',
                 'Name': 'Task Nodes'}
            ],
            'KeepJobFlowAliveWhenNoSteps': False,
            'TerminationProtected': False
        },
        'Applications': [{'Name': 'Hadoop'}, {'Name': 'Spark'}],
        'Name': 'Test SparkSteps',
        'JobFlowRole': 'EMR_EC2_DefaultRole',
        'ReleaseLabel': 'emr-5.2.0',
        'VisibleToAllUsers': True,
        'ServiceRole': 'EMR_DefaultRole',
        'Configurations': [{'Classification': 'spark',
                            'Properties': {'maximizeResourceAllocation': 'true'}}]
    }
    client = boto3.client('emr', region_name=AWS_REGION_NAME)
    client.run_job_flow(**config)
def test_emr_spot_cluster():
    config = emr_config('emr-5.2.0',
                        instance_type_master='m4.large',
                        keep_alive=False,
                        instance_type_core='c3.8xlarge',
                        instance_type_task='c3.8xlarge',
                        num_core=2,
                        num_task=4,
                        bid_price_master='0.05',
                        bid_price_core='0.25',
                        bid_price_task='0.1',
                        name="Test SparkSteps",
                        bootstrap_script='s3://bucket/bootstrap-actions.sh')
    assert config == {
        'Instances': {
            'InstanceGroups': [
                {'InstanceCount': 1,
                 'InstanceRole': 'MASTER',
                 'InstanceType': 'm4.large',
                 'Market': 'SPOT',
                 'BidPrice': '0.05',
                 'Name': 'Master Node'},
                {'BidPrice': '0.25',
                 'InstanceCount': 2,
                 'InstanceRole': 'CORE',
                 'InstanceType': 'c3.8xlarge',
                 'Market': 'SPOT',
                 'Name': 'Core Nodes'},
                {'BidPrice': '0.1',
                 'InstanceCount': 4,
                 'InstanceRole': 'TASK',
                 'InstanceType': 'c3.8xlarge',
                 'Market': 'SPOT',
                 'Name': 'Task Nodes'}
            ],
            'KeepJobFlowAliveWhenNoSteps': False,
            'TerminationProtected': False
        },
        'Applications': [{'Name': 'Hadoop'}, {'Name': 'Spark'}],
        'BootstrapActions': [{'Name': 'bootstrap',
                              'ScriptBootstrapAction': {
                                  'Path': 's3://bucket/bootstrap-actions.sh'}}],
        'Name': 'Test SparkSteps',
        'JobFlowRole': 'EMR_EC2_DefaultRole',
        'ReleaseLabel': 'emr-5.2.0',
        'VisibleToAllUsers': True,
        'ServiceRole': 'EMR_DefaultRole'
    }
def test_emr_cluster_config_with_bootstrap():
    config = emr_config('emr-5.2.0',
                        instance_type_master='m4.large',
                        keep_alive=False,
                        instance_type_core='m4.2xlarge',
                        instance_type_task='m4.2xlarge',
                        num_core=1,
                        num_task=1,
                        bid_price_task='0.1',
                        name="Test SparkSteps",
                        bootstrap_script='s3://bucket/bootstrap-actions.sh')
    assert config == {
        'Instances': {
            'InstanceGroups': [
                {'InstanceCount': 1,
                 'InstanceRole': 'MASTER',
                 'InstanceType': 'm4.large',
                 'Market': 'ON_DEMAND',
                 'Name': 'Master Node'},
                {'InstanceCount': 1,
                 'InstanceRole': 'CORE',
                 'InstanceType': 'm4.2xlarge',
                 'Market': 'ON_DEMAND',
                 'Name': 'Core Nodes'},
                {'BidPrice': '0.1',
                 'InstanceCount': 1,
                 'InstanceRole': 'TASK',
                 'InstanceType': 'm4.2xlarge',
                 'Market': 'SPOT',
                 'Name': 'Task Nodes'}
            ],
            'KeepJobFlowAliveWhenNoSteps': False,
            'TerminationProtected': False
        },
        'Applications': [{'Name': 'Hadoop'}, {'Name': 'Spark'}],
        'BootstrapActions': [{'Name': 'bootstrap',
                              'ScriptBootstrapAction': {
                                  'Path': 's3://bucket/bootstrap-actions.sh'}}],
        'Name': 'Test SparkSteps',
        'JobFlowRole': 'EMR_EC2_DefaultRole',
        'ReleaseLabel': 'emr-5.2.0',
        'VisibleToAllUsers': True,
        'ServiceRole': 'EMR_DefaultRole'
    }
    client = boto3.client('emr', region_name=AWS_REGION_NAME)
    client.run_job_flow(**config)
def test_emr_ebs_storage():
    config = emr_config('emr-5.2.0',
                        instance_type_master='m4.large',
                        keep_alive=False,
                        instance_type_core='c3.8xlarge',
                        instance_type_task='c3.8xlarge',
                        ebs_volume_size_core=100,
                        ebs_volume_type_core='gp2',
                        ebs_volumes_per_core=2,
                        ebs_volume_size_task=10,
                        ebs_volume_type_task='io1',
                        ebs_optimized_task=True,
                        num_core=2,
                        num_task=4,
                        bid_price_master='0.05',
                        bid_price_core='0.25',
                        bid_price_task='0.1',
                        name="Test SparkSteps",
                        bootstrap_script='s3://bucket/bootstrap-actions.sh')
    assert config == {
        'Instances': {
            'InstanceGroups': [
                {'InstanceCount': 1,
                 'InstanceRole': 'MASTER',
                 'InstanceType': 'm4.large',
                 'Market': 'SPOT',
                 'BidPrice': '0.05',
                 'Name': 'Master Node'},
                {'BidPrice': '0.25',
                 'InstanceCount': 2,
                 'InstanceRole': 'CORE',
                 'InstanceType': 'c3.8xlarge',
                 'Market': 'SPOT',
                 'Name': 'Core Nodes',
                 'EbsConfiguration': {
                     'EbsBlockDeviceConfigs': [{
                         'VolumeSpecification': {
                             'VolumeType': 'gp2',
                             'SizeInGB': 100
                         },
                         'VolumesPerInstance': 2
                     }],
                     'EbsOptimized': False
                 }},
                {'BidPrice': '0.1',
                 'InstanceCount': 4,
                 'InstanceRole': 'TASK',
                 'InstanceType': 'c3.8xlarge',
                 'Market': 'SPOT',
                 'Name': 'Task Nodes',
                 'EbsConfiguration': {
                     'EbsBlockDeviceConfigs': [{
                         'VolumeSpecification': {
                             'VolumeType': 'io1',
                             'SizeInGB': 10
                         },
                         'VolumesPerInstance': 1
                     }],
                     'EbsOptimized': True
                 }}
            ],
            'KeepJobFlowAliveWhenNoSteps': False,
            'TerminationProtected': False
        },
        'Applications': [{'Name': 'Hadoop'}, {'Name': 'Spark'}],
        'BootstrapActions': [{'Name': 'bootstrap',
                              'ScriptBootstrapAction': {
                                  'Path': 's3://bucket/bootstrap-actions.sh'}}],
        'Name': 'Test SparkSteps',
        'JobFlowRole': 'EMR_EC2_DefaultRole',
        'ReleaseLabel': 'emr-5.2.0',
        'VisibleToAllUsers': True,
        'ServiceRole': 'EMR_DefaultRole'
    }
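
# Hypothetical helper showing how the ebs_* keyword arguments above could map
# onto the EbsConfiguration structure EMR expects for an instance group. The
# name and signature are illustrative, not the package's actual code path;
# the dict shape matches what the test above asserts.
def ebs_configuration_sketch(volume_size, volume_type,
                             volumes_per_instance=1, optimized=False):
    """Build an EbsConfiguration dict for one EMR instance group."""
    return {
        'EbsBlockDeviceConfigs': [{
            'VolumeSpecification': {
                'VolumeType': volume_type,   # e.g. 'gp2' or 'io1'
                'SizeInGB': volume_size,
            },
            'VolumesPerInstance': volumes_per_instance,
        }],
        'EbsOptimized': optimized,
    }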