Example #1
    def execute(self, context: 'Context') -> str:
        emr = EmrHook(aws_conn_id=self.aws_conn_id,
                      emr_conn_id=self.emr_conn_id,
                      region_name=self.region_name)

        self.log.info(
            'Creating JobFlow using aws-conn-id: %s, emr-conn-id: %s',
            self.aws_conn_id, self.emr_conn_id)

        # job_flow_overrides is a templated field, so after rendering it may
        # arrive as a string; parse it back into a dict before calling the API.
        if isinstance(self.job_flow_overrides, str):
            job_flow_overrides: Dict[str, Any] = ast.literal_eval(
                self.job_flow_overrides)
            self.job_flow_overrides = job_flow_overrides
        else:
            job_flow_overrides = self.job_flow_overrides
        response = emr.create_job_flow(job_flow_overrides)

        if response['ResponseMetadata']['HTTPStatusCode'] != 200:
            raise AirflowException(f'JobFlow creation failed: {response}')

        job_flow_id = response['JobFlowId']
        self.log.info('JobFlow with id %s created', job_flow_id)
        EmrClusterLink.persist(
            context=context,
            operator=self,
            region_name=emr.conn_region_name,
            aws_partition=emr.conn_partition,
            job_flow_id=job_flow_id,
        )
        return job_flow_id
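This snippet matches the execute method of Airflow's EmrCreateJobFlowOperator. Below is a minimal sketch of how it might be wired into a DAG, assuming the amazon provider is installed; the task ids are illustrative, and the schedule parameter name varies across Airflow versions.

from datetime import datetime

from airflow import DAG
from airflow.providers.amazon.aws.operators.emr import EmrCreateJobFlowOperator
from airflow.providers.amazon.aws.sensors.emr import EmrJobFlowSensor

with DAG(dag_id='emr_example', start_date=datetime(2023, 1, 1), schedule=None) as dag:
    create_cluster = EmrCreateJobFlowOperator(
        task_id='create_cluster',
        job_flow_overrides={'Name': 'my-cluster'},  # merged over the emr_conn_id config
        aws_conn_id='aws_default',
        emr_conn_id='emr_default',
    )
    # execute() above returns the job flow id, which lands in XCom; the
    # sensor waits for the cluster to reach a terminal state.
    wait = EmrJobFlowSensor(
        task_id='wait_for_cluster',
        job_flow_id=create_cluster.output,
    )
    create_cluster >> wait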
Example #2
    def test_create_job_flow_uses_the_emr_config_to_create_a_cluster(self):
        client = boto3.client('emr', region_name='us-east-1')

        hook = EmrHook(aws_conn_id='aws_default', emr_conn_id='emr_default')
        cluster = hook.create_job_flow({'Name': 'test_cluster'})

        self.assertEqual(client.list_clusters()['Clusters'][0]['Id'],
                         cluster['JobFlowId'])
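What makes this test pass: EmrHook.create_job_flow starts from the config stored in the emr_conn_id connection and applies the given dict on top. A sketch of how such a connection could be defined; the Extra JSON here is an illustrative assumption, not the stock emr_default contents.

from airflow.models import Connection

# Hypothetical connection: the hook reads the base job-flow config from the
# Extra field of the EMR connection and merges caller overrides on top.
emr_conn = Connection(
    conn_id='emr_default',
    conn_type='emr',
    extra='{"Name": "default_cluster", "ReleaseLabel": "emr-5.29.0"}',
)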
Example #3
    def execute(self, context):
        emr = EmrHook(aws_conn_id=self.aws_conn_id,
                      emr_conn_id=self.emr_conn_id,
                      region_name=self.region_name)

        self.log.info(
            'Creating JobFlow using aws-conn-id: %s, emr-conn-id: %s',
            self.aws_conn_id, self.emr_conn_id)
        response = emr.create_job_flow(self.job_flow_overrides)

        if response['ResponseMetadata']['HTTPStatusCode'] != 200:
            raise AirflowException(f'JobFlow creation failed: {response}')

        self.log.info('JobFlow with id %s created', response['JobFlowId'])
        return response['JobFlowId']
Example #4
    def test_get_cluster_id_by_name(self):
        """
        Test that we can resolve cluster id by cluster name.
        """
        hook = EmrHook(aws_conn_id='aws_default', emr_conn_id='emr_default')

        job_flow = hook.create_job_flow({'Name': 'test_cluster',
                                         'Instances': {'KeepJobFlowAliveWhenNoSteps': True}})

        job_flow_id = job_flow['JobFlowId']

        matching_cluster = hook.get_cluster_id_by_name('test_cluster', ['RUNNING', 'WAITING'])

        self.assertEqual(matching_cluster, job_flow_id)

        no_match = hook.get_cluster_id_by_name('foo', ['RUNNING', 'WAITING', 'BOOTSTRAPPING'])

        self.assertIsNone(no_match)
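For reference, a hedged sketch of what a name-based lookup like get_cluster_id_by_name could do with plain boto3. The function name and the return-None-on-no-match convention are assumptions for illustration, not the hook's actual implementation.

import boto3

def find_cluster_id_by_name(name, states, region_name='us-east-1'):
    # Illustrative only: page through clusters in the given states and
    # return the id of the single cluster whose name matches, else None.
    client = boto3.client('emr', region_name=region_name)
    paginator = client.get_paginator('list_clusters')
    matches = [
        cluster['Id']
        for page in paginator.paginate(ClusterStates=states)
        for cluster in page['Clusters']
        if cluster['Name'] == name
    ]
    return matches[0] if len(matches) == 1 else None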
Example #5
    def test_create_job_flow_extra_args(self):
        """
        Test that we can add extra arguments to the launch call.

        This is useful for when AWS adds new options, such as
        "SecurityConfiguration", so that we don't have to change our code.
        """
        client = boto3.client('emr', region_name='us-east-1')

        hook = EmrHook(aws_conn_id='aws_default', emr_conn_id='emr_default')
        # AmiVersion is really old and almost no one will use it anymore, but
        # it's one of the "optional" request params that moto supports - its
        # coverage of EMR isn't 100%, it turns out.
        cluster = hook.create_job_flow({'Name': 'test_cluster',
                                        'ReleaseLabel': '',
                                        'AmiVersion': '3.2'})

        cluster = client.describe_cluster(ClusterId=cluster['JobFlowId'])['Cluster']

        # The AmiVersion comes back as {Requested,Running}AmiVersion fields.
        self.assertEqual(cluster['RequestedAmiVersion'], '3.2')
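The behaviour this test depends on can be pictured as a simple dict merge followed by a pass-through call to boto3. A sketch under that assumption; the function and parameter names are hypothetical.

import boto3

def create_job_flow_sketch(conn_config, job_flow_overrides, region_name='us-east-1'):
    # Hypothetical reimplementation: caller overrides win over connection
    # defaults, and valid-but-unanticipated params such as 'AmiVersion'
    # pass straight through to the EMR API.
    config = {**conn_config, **job_flow_overrides}
    client = boto3.client('emr', region_name=region_name)
    return client.run_job_flow(**config)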
Example #6
    def execute(self, context: Dict[str, Any]) -> str:
        emr = EmrHook(
            aws_conn_id=self.aws_conn_id, emr_conn_id=self.emr_conn_id, region_name=self.region_name
        )

        self.log.info(
            'Creating JobFlow using aws-conn-id: %s, emr-conn-id: %s', self.aws_conn_id, self.emr_conn_id
        )

        if isinstance(self.job_flow_overrides, str):
            job_flow_overrides: Dict[str, Any] = ast.literal_eval(self.job_flow_overrides)
            self.job_flow_overrides = job_flow_overrides
        else:
            job_flow_overrides = self.job_flow_overrides
        response = emr.create_job_flow(job_flow_overrides)

        if response['ResponseMetadata']['HTTPStatusCode'] != 200:
            raise AirflowException(f'JobFlow creation failed: {response}')

        self.log.info('JobFlow with id %s created', response['JobFlowId'])
        return response['JobFlowId']
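The string branch in this example (and in Example #1) exists because job_flow_overrides is a templated field and may arrive as a rendered string. A short illustration of what ast.literal_eval handles; the rendered value is made up.

import ast

# A templated value after Jinja rendering: a dict literal in string form.
rendered = "{'Name': 'etl-cluster-20230101', 'ReleaseLabel': 'emr-5.29.0'}"
overrides = ast.literal_eval(rendered)  # parses literals only; no code execution
assert overrides['Name'] == 'etl-cluster-20230101'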
Example #7
    def execute(self, context):
        # Define hooks. Note: s3_hook is created here but never used below.
        s3_hook = S3Hook(aws_conn_id=self.aws_conn_id)
        emr = EmrHook(aws_conn_id=self.aws_conn_id,
                      emr_conn_id=self.emr_conn_id,
                      region_name='us-west-2')

        self.log.info('Defining JobFlow...')

        SPARK_STEPS = [{
            'Name': "copy-files-" + time.strftime("%Y%m%d-%H:%M"),
            'ActionOnFailure': 'CANCEL_AND_WAIT',
            'HadoopJarStep': {
                'Jar': 'command-runner.jar',
                'Args': [
                    "aws", "s3", "cp",
                    f"s3://{Variable.get('S3_CODE_BUCKET_NAME')}/",
                    "/home/hadoop/", "--recursive"
                ]
            }
        }, {
            'Name': 'run-etl-' + time.strftime("%Y%m%d-%H:%M"),
            'ActionOnFailure': 'CANCEL_AND_WAIT',
            'HadoopJarStep': {
                'Jar': 'command-runner.jar',
                'Args': [
                    'spark-submit', '/home/hadoop/spark_etl.py', '-q',
                    Variable.get('S3_RAW_QUOTES_BUCKET_NAME'), '-op',
                    Variable.get('S3_RAW_OPTIONS_BUCKET_NAME'), '-d',
                    context['ds_nodash'], '-o',
                    Variable.get('S3_DATA_BUCKET_NAME')
                ]
            }
        }]

        s_s32log = 's3n://aws-logs-345196100842-us-west-2/elasticmapreduce/'
        self.job_flow_overrides = {
            'Name': 'etl-process',
            'LogUri': s_s32log,
            'ReleaseLabel': 'emr-5.20.0',
            'Instances': {
                'InstanceGroups': [{
                    "InstanceCount": 2,
                    "EbsConfiguration": {
                        "EbsBlockDeviceConfigs": [{
                            "VolumeSpecification": {
                                "SizeInGB": 32,
                                "VolumeType": "gp2"
                            },
                            "VolumesPerInstance": 1
                        }]
                    },
                    'Market': 'SPOT',
                    "InstanceRole": "CORE",
                    "InstanceType": "m5.xlarge",
                    "Name": "Core Instance Group"
                }, {
                    "InstanceCount": 1,
                    "EbsConfiguration": {
                        "EbsBlockDeviceConfigs": [{
                            "VolumeSpecification": {
                                "SizeInGB": 32,
                                "VolumeType": "gp2"
                            },
                            "VolumesPerInstance": 1
                        }]
                    },
                    'Market': 'SPOT',
                    "InstanceRole": "MASTER",
                    "InstanceType": "m5.xlarge",
                    "Name": "Master Instance Group"
                }],
            'KeepJobFlowAliveWhenNoSteps': False,
            'TerminationProtected': False,
            },
            'Steps': SPARK_STEPS,
            'JobFlowRole': 'EMR_EC2_DefaultRole',
            'ServiceRole': 'EMR_DefaultRole',
        }

        self.log.info('Creating JobFlow...')

        # job_flow_overrides was assigned a dict literal above, so the
        # string-parsing branch the other examples need is unnecessary here.
        response = emr.create_job_flow(self.job_flow_overrides)

        if response['ResponseMetadata']['HTTPStatusCode'] != 200:
            raise AirflowException(f'JobFlow creation failed: {response}')

        self.log.info('JobFlow with id %s created', response['JobFlowId'])
        return response['JobFlowId']
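The step definitions in this example assume several Airflow Variables are already set. A sketch of seeding them; all bucket names are placeholders.

from airflow.models import Variable

# Placeholders only: substitute the real bucket names for your account.
# Variable.set writes to the metadata DB, so run this inside an initialised
# Airflow environment.
Variable.set('S3_CODE_BUCKET_NAME', 'my-code-bucket')
Variable.set('S3_RAW_QUOTES_BUCKET_NAME', 'my-raw-quotes-bucket')
Variable.set('S3_RAW_OPTIONS_BUCKET_NAME', 'my-raw-options-bucket')
Variable.set('S3_DATA_BUCKET_NAME', 'my-output-bucket')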