def emr_execute_hive(self, job_name, s3_hive_script):
    # Submit a Hive script stored on S3 as a new step on the running job flow.
    from boto.emr.step import HiveStep

    hive_step = HiveStep(name=job_name, hive_file=s3_hive_script)
    hive_step.action_on_failure = 'CONTINUE'
    ret_steps = self.emr_conn.add_jobflow_steps(self.jobflow_id, steps=[hive_step])
    # add_jobflow_steps() returns the ids of the newly added steps.
    step_ids = [s.value for s in ret_steps.stepids]
    return step_ids
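# A hedged sketch of how the emr_conn / jobflow_id attributes used above might
# be created with boto 2; the names and values here are illustrative
# assumptions, not taken from the original class.
import boto.emr
from boto.emr.step import InstallHiveStep

emr_conn = boto.emr.connect_to_region('us-east-1')
jobflow_id = emr_conn.run_jobflow(
    name='hive-cluster',
    log_uri='s3://example-bucket/emr-logs/',
    ami_version='3.8.0',
    num_instances=3,
    keep_alive=True,               # keep the cluster up so steps can be added later
    steps=[InstallHiveStep()])     # install Hive before any HiveStep runs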
def run(self):
    """Run the Hive job on the EMR cluster."""
    from boto.emr.connection import EmrConnection
    from boto.emr.step import HiveStep, InstallHiveStep

    # Copy the data source to a new object
    # (Hive deletes/moves the original).
    copy_s3_file(self.input_path, self.data_path)

    # Generate the Hive script and upload it to S3.
    self._generate_and_upload_hive_script()

    logger.info("Waiting {} seconds for S3 eventual consistency".format(
        self.s3_sync_wait_time))
    time.sleep(self.s3_sync_wait_time)

    # TODO: more options, like setting the AWS region
    conn = EmrConnection(self.aws_access_key_id, self.aws_secret_access_key)

    setup_step = InstallHiveStep(self.hive_version)
    run_step = HiveStep(self.job_name, self.script_path)

    cluster_id = conn.run_jobflow(
        self.job_name,
        self.log_path,
        action_on_failure='CANCEL_AND_WAIT',
        master_instance_type=self.master_instance_type,
        slave_instance_type=self.slave_instance_type,
        ami_version=self.ami_version,
        num_instances=self.num_instances,
        job_flow_role=self.iam_instance_profile,
        service_role=self.iam_service_role)

    conn.add_jobflow_steps(cluster_id, [setup_step, run_step])

    logger.info("Job started on cluster {0}".format(cluster_id))
    self._wait_for_job_to_complete(conn, cluster_id)
    logger.info("Output file is in: {0}".format(self.output_path))
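# A minimal sketch of the copy_s3_file() helper called above, assuming boto 2
# and s3://bucket/key style paths; the actual helper in the source may differ.
from urlparse import urlparse

import boto


def copy_s3_file(src_path, dst_path):
    """Copy an S3 object to a new key so Hive can consume (and move) the copy."""
    src = urlparse(src_path)
    dst = urlparse(dst_path)
    s3 = boto.connect_s3()  # credentials come from the environment / boto config
    dst_bucket = s3.get_bucket(dst.netloc)
    # copy_key(new_key_name, src_bucket_name, src_key_name)
    dst_bucket.copy_key(dst.path.lstrip('/'), src.netloc, src.path.lstrip('/'))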
def emr_execute_hive(self, s3_hive_script):
    from boto.emr.step import HiveStep
    hive_step = HiveStep(name=self.get_emr_job_name(), hive_file=s3_hive_script)
    self.emr_conn.add_jobflow_steps(self.job_flow_id, steps=[hive_step])
    emr_wait_job(self.emr_conn, self.job_flow_id)
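# A hedged sketch of the emr_wait_job() helper referenced above, assuming it
# simply polls boto 2's describe_jobflow() until the job flow settles; the
# real helper and its set of terminal states may differ.
import time

_DONE_STATES = frozenset(['COMPLETED', 'WAITING', 'TERMINATED', 'FAILED'])


def emr_wait_job(emr_conn, job_flow_id, poll_interval=30):
    """Block until the job flow reaches an idle or terminal state."""
    while True:
        state = emr_conn.describe_jobflow(job_flow_id).state
        if state in _DONE_STATES:
            return state
        time.sleep(poll_interval)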