Example #1
def emr_execute_hive(self, job_name, s3_hive_script):
    from boto.emr.step import HiveStep

    # Build a Hive step; 'CONTINUE' lets the job flow keep running even if this step fails
    hive_step = HiveStep(name=job_name, hive_file=s3_hive_script)
    hive_step.action_on_failure = 'CONTINUE'
    # Submit the step to the existing job flow and return the new step IDs
    ret_steps = self.emr_conn.add_jobflow_steps(self.jobflow_id,
                                                steps=[hive_step])
    step_ids = [s.value for s in ret_steps.stepids]
    return step_ids
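The step IDs returned above can be polled until the steps finish. A minimal sketch, assuming boto's EmrConnection.describe_step call; the helper name wait_for_steps and the 30-second interval are illustrative and not part of the original code:

import time

def wait_for_steps(emr_conn, jobflow_id, step_ids, poll_seconds=30):
    # Poll each submitted step until it reaches a terminal state
    terminal = ('COMPLETED', 'CANCELLED', 'FAILED', 'INTERRUPTED')
    for step_id in step_ids:
        while emr_conn.describe_step(jobflow_id, step_id).status.state not in terminal:
            time.sleep(poll_seconds)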
Example #2
    def run(self):
        """Run the Hive job on EMR cluster
        """
        #  copy the data source to a new object
        #  (Hive deletes/moves the original)
        copy_s3_file(self.input_path, self.data_path)

        # generate the Hive script and upload it to S3
        self._generate_and_upload_hive_script()

        logger.info("Waiting {} seconds for S3 eventual consistency".format(
            self.s3_sync_wait_time))
        time.sleep(self.s3_sync_wait_time)

        # TODO: support more options, such as setting the AWS region
        conn = EmrConnection(self.aws_access_key_id,
                             self.aws_secret_access_key)

        setup_step = InstallHiveStep(self.hive_version)
        run_step = HiveStep(self.job_name, self.script_path)

        cluster_id = conn.run_jobflow(
            self.job_name,
            self.log_path,
            action_on_failure='CANCEL_AND_WAIT',
            master_instance_type=self.master_instance_type,
            slave_instance_type=self.slave_instance_type,
            ami_version=self.ami_version,
            num_instances=self.num_instances,
            job_flow_role=self.iam_instance_profile,
            service_role=self.iam_service_role)

        conn.add_jobflow_steps(cluster_id, [setup_step, run_step])

        logger.info("Job started on cluster {0}".format(cluster_id))

        self._wait_for_job_to_complete(conn, cluster_id)

        logger.info("Output file is in: {0}".format(self.output_path))
Example #3
def emr_execute_hive(self, s3_hive_script):
    from boto.emr.step import HiveStep

    # Add the Hive script as a step on the existing job flow,
    # then block until the job flow finishes
    hive_step = HiveStep(name=self.get_emr_job_name(), hive_file=s3_hive_script)
    self.emr_conn.add_jobflow_steps(self.job_flow_id, steps=[hive_step])
    emr_wait_job(self.emr_conn, self.job_flow_id)
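Examples #2 and #3 both call a wait helper (_wait_for_job_to_complete, emr_wait_job) whose body is not shown. A minimal polling sketch of what such a helper might look like, assuming boto's EmrConnection.describe_jobflow call; the function name, 30-second interval, and terminal-state list are illustrative:

import time

def wait_for_jobflow(emr_conn, jobflow_id, poll_seconds=30):
    # Poll the job flow until it leaves its transient states
    done_states = ('WAITING', 'COMPLETED', 'FAILED', 'TERMINATED')
    while True:
        state = emr_conn.describe_jobflow(jobflow_id).state
        if state in done_states:
            return state
        time.sleep(poll_seconds)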