def __init__(self, log_path, output_path):
    """Configure a Pig step bound to *log_path*, writing results to *output_path*."""
    self.log_path = log_path
    self.output_path = output_path
    self.name = '%s (%s)' % (self.STEP_NAME, self.log_path)
    # Forward both paths to the Pig script as -p parameters.
    params = [
        '-p', 'OUTPUT=%s' % self.output_path,
        '-p', 'LOGFILE=%s' % self.log_path,
    ]
    PigStep.__init__(self, self.name, self.PIG_FILE, pig_args=params)
def __init__(self, log_path, output_path):
    """Set up the step name and Pig -p parameters, then delegate to PigStep."""
    self.log_path = log_path
    self.output_path = output_path
    self.name = '%s (%s)' % (self.STEP_NAME, self.log_path)
    # Each parameter is a ('-p', 'KEY=value') pair handed to the Pig script.
    output_param = ['-p', 'OUTPUT=%s' % self.output_path]
    logfile_param = ['-p', 'LOGFILE=%s' % self.log_path]
    PigStep.__init__(self, self.name, self.PIG_FILE,
                     pig_args=output_param + logfile_param)
def emr_execute_pig(self, pig_filename):
    """Upload *pig_filename* to S3, submit it as a Pig step, and wait for it."""
    from boto.emr.step import PigStep
    script_uri = self.s3_upload(pig_filename)
    step = PigStep(name=self.get_emr_job_name(), pig_file=script_uri)
    self.emr_conn.add_jobflow_steps(self.job_flow_id, steps=[step])
    # Block until the job flow reports completion.
    emr_wait_job(self.emr_conn, self.job_flow_id)
def add_pig_step(self, jobflow_id, pig_file, name='Pig Script',
                 pig_versions='latest', pig_args=None):
    """Submit a Pig script as a step on *jobflow_id* and wait for the cluster.

    :param jobflow_id: EMR job flow id to attach the step to.
    :param pig_file: Location of the Pig script to run (passed to PigStep).
    :param name: Display name for the step.
    :param pig_versions: Pig version selector forwarded to PigStep.
    :param pig_args: Optional extra arguments for the Pig script; defaults
        to an empty list.
    :returns: The result of ``_poll_until_cluster_ready`` once the cluster
        is idle again.
    """
    # BUG FIX: the original used a mutable default (pig_args=[]), which is
    # shared across calls; use a None sentinel and normalize here instead.
    if pig_args is None:
        pig_args = []
    pig_step = PigStep(
        name=name,
        pig_file=pig_file,
        pig_versions=pig_versions,
        pig_args=pig_args,
        # action_on_failure='CONTINUE',
    )
    self.emr_connection.add_jobflow_steps(jobflow_id, [pig_step])
    # Poll until the cluster is done working
    return self._poll_until_cluster_ready(jobflow_id)
pig_file = 's3://elasticmapreduce/samples/pig-apache/do-reports2.pig' INPUT = 's3://elasticmapreduce/samples/pig-apache/input/' OUTPUT = ('s3://org.unencrypted.emr.output/apache_sample/%s' % datetime.datetime.utcnow().strftime("%s")) print """\ Running pig job with settings: SCRIPT={script} INPUT={input} OUPUT={output} """.format(script=pig_file, input=INPUT, output=OUTPUT) pig_args = ['-p', 'INPUT=%s' % INPUT, '-p', 'OUTPUT=%s' % OUTPUT] pig_step = PigStep('Process Reports', pig_file, pig_args=pig_args) steps = [InstallPigStep(), pig_step] job_id = conn.run_jobflow( name='sample apache report', ec2_keyname=os.getenv("EC2_KEY_NAME"), steps=steps, log_uri="s3://org.unencrypted.emr.log/sampleflow_logs", enable_debugging=True, ami_version="latest", instance_groups=instance_groups, keep_alive=True) print job_id
def emr_execute_pig(self, job_name, s3_pig_script):
    """Queue *s3_pig_script* as a Pig step on the job flow; return the step ids."""
    step = PigStep(name=job_name, pig_file=s3_pig_script)
    # Keep the job flow alive even if this step fails.
    step.action_on_failure = 'CONTINUE'
    response = self.emr_conn.add_jobflow_steps(self.jobflow_id, steps=[step])
    return [step_id.value for step_id in response.stepids]
def __init__(self, input_path, output_path):
    """Derive the step name and Pig -p arguments, then init the base PigStep."""
    self.input_path = input_path
    self.output_path = output_path
    self.name = "%s (%s)" % (self.STEP_NAME, self.input_path)
    # Build the parameter list incrementally: input first, then output.
    args = ["-p", "INPUT=%s" % self.input_path]
    args += ["-p", "OUTPUT=%s" % self.output_path]
    PigStep.__init__(self, self.name, self.PIG_FILE, pig_args=args)