コード例 #1
0
ファイル: emr_traffic.py プロジェクト: Arinzeokeke/reddit
 def __init__(self, log_path, output_path):
     """Configure this Pig step for one log file.

     Stores both paths on the instance, derives the step name from the
     class-level ``STEP_NAME`` and the log path, and hands the paths to
     ``PigStep.__init__`` as ``-p OUTPUT=...`` / ``-p LOGFILE=...``
     arguments for the script named by ``PIG_FILE``.

     :param log_path: location of the log to process (substituted into
         the ``LOGFILE`` parameter and the step name).
     :param output_path: location the script should write to
         (substituted into the ``OUTPUT`` parameter).
     """
     self.log_path = log_path
     self.output_path = output_path
     self.name = '%s (%s)' % (self.STEP_NAME, self.log_path)

     # Each script parameter is passed as a '-p', 'NAME=value' pair.
     output_param = 'OUTPUT=%s' % self.output_path
     logfile_param = 'LOGFILE=%s' % self.log_path
     PigStep.__init__(
         self,
         self.name,
         self.PIG_FILE,
         pig_args=['-p', output_param, '-p', logfile_param],
     )
コード例 #2
0
 def __init__(self, log_path, output_path):
     """Build a Pig step that reads ``log_path`` and targets ``output_path``.

     The step is named ``"<STEP_NAME> (<log_path>)"`` and runs the script
     referenced by the class attribute ``PIG_FILE``. The two paths are
     forwarded to the base ``PigStep`` as ``-p OUTPUT=...`` and
     ``-p LOGFILE=...`` entries in ``pig_args``.

     :param log_path: value for the ``LOGFILE`` script parameter.
     :param output_path: value for the ``OUTPUT`` script parameter.
     """
     self.log_path = log_path
     self.output_path = output_path
     self.name = '%s (%s)' % (self.STEP_NAME, self.log_path)

     # Assemble the argument list one '-p NAME=value' pair at a time.
     pig_args = ['-p', 'OUTPUT=%s' % self.output_path]
     pig_args += ['-p', 'LOGFILE=%s' % self.log_path]

     PigStep.__init__(self, self.name, self.PIG_FILE, pig_args=pig_args)
コード例 #3
0
 def emr_execute_pig(self, pig_filename):
     """Upload a Pig script and run it as a step on this object's job flow.

     The script is passed through ``self.s3_upload`` and the value that
     call returns is used as the step's ``pig_file``. The step is named
     by ``self.get_emr_job_name()``, added to ``self.job_flow_id`` through
     ``self.emr_conn``, and then ``emr_wait_job`` is called for that job
     flow (presumably blocking until it finishes -- confirm against
     ``emr_wait_job``).

     :param pig_filename: Pig script to hand to ``self.s3_upload``.
     """
     # Imported lazily, as in the original, so boto is only needed here.
     from boto.emr.step import PigStep

     script_location = self.s3_upload(pig_filename)
     step_name = self.get_emr_job_name()
     step = PigStep(name=step_name, pig_file=script_location)

     self.emr_conn.add_jobflow_steps(self.job_flow_id, steps=[step])
     emr_wait_job(self.emr_conn, self.job_flow_id)
コード例 #4
0
    def add_pig_step(self,
                     jobflow_id,
                     pig_file,
                     name='Pig Script',
                     pig_versions='latest',
                     pig_args=None):
        """Add a Pig step to a job flow and poll until the cluster is ready.

        :param jobflow_id: identifier of the job flow to add the step to.
        :param pig_file: Pig script passed to ``PigStep`` as ``pig_file``.
        :param name: display name of the step.
        :param pig_versions: value passed to ``PigStep`` as ``pig_versions``.
        :param pig_args: optional list of extra arguments for the script;
            ``None`` (the default) means no extra arguments.
        :return: whatever ``self._poll_until_cluster_ready(jobflow_id)``
            returns.
        """
        # A fresh list per call: a literal ``[]`` default would be a single
        # list object shared by every call that omits ``pig_args``.
        if pig_args is None:
            pig_args = []

        pig_step = PigStep(
            name=name,
            pig_file=pig_file,
            pig_versions=pig_versions,
            pig_args=pig_args,
        )

        self.emr_connection.add_jobflow_steps(jobflow_id, [pig_step])

        # Poll until the cluster is done working
        return self._poll_until_cluster_ready(jobflow_id)
コード例 #5
0
# Sample job: run the Apache-log report Pig script on EMR.
# NOTE(review): `conn`, `instance_groups`, `PigStep`, `InstallPigStep`,
# `datetime` and `os` are expected to be defined/imported earlier in the file.
pig_file = 's3://elasticmapreduce/samples/pig-apache/do-reports2.pig'
INPUT = 's3://elasticmapreduce/samples/pig-apache/input/'

# Suffix the output prefix with the current Unix time so each run gets its
# own location. The epoch is computed explicitly because strftime("%s") is a
# platform-specific extension (unavailable on some systems) and treats a
# naive UTC datetime as local time.
OUTPUT = ('s3://org.unencrypted.emr.output/apache_sample/%d' %
          (datetime.datetime.utcnow() -
           datetime.datetime(1970, 1, 1)).total_seconds())

# Parenthesised single-argument print works as a statement on Python 2 and
# as a function call on Python 3.
print("""\
Running pig job with settings:

    SCRIPT={script}
    INPUT={input}
    OUTPUT={output}
""".format(script=pig_file, input=INPUT, output=OUTPUT))

# Script parameters are passed as '-p', 'NAME=value' pairs.
pig_args = ['-p', 'INPUT=%s' % INPUT,
            '-p', 'OUTPUT=%s' % OUTPUT]

pig_step = PigStep('Process Reports', pig_file, pig_args=pig_args)
# The install step is listed ahead of the step that runs the script.
steps = [InstallPigStep(), pig_step]

job_id = conn.run_jobflow(
    name='sample apache report',
    ec2_keyname=os.getenv("EC2_KEY_NAME"),
    steps=steps,
    log_uri="s3://org.unencrypted.emr.log/sampleflow_logs",
    enable_debugging=True,
    ami_version="latest",
    instance_groups=instance_groups,
    keep_alive=True)

print(job_id)
コード例 #6
0
ファイル: clusters.py プロジェクト: dkuner/pyDataCanvas
 def emr_execute_pig(self, job_name, s3_pig_script):
     """Submit a Pig script as a step on this object's job flow.

     The step's ``action_on_failure`` is set to ``'CONTINUE'`` before it
     is added to ``self.jobflow_id`` through ``self.emr_conn``.

     :param job_name: name given to the step.
     :param s3_pig_script: script passed to ``PigStep`` as ``pig_file``.
     :return: list of the ``value`` attribute of each entry in the
         ``stepids`` of the ``add_jobflow_steps`` response.
     """
     step = PigStep(name=job_name, pig_file=s3_pig_script)
     step.action_on_failure = 'CONTINUE'

     response = self.emr_conn.add_jobflow_steps(self.jobflow_id, steps=[step])

     step_ids = []
     for entry in response.stepids:
         step_ids.append(entry.value)
     return step_ids
コード例 #7
0
ファイル: emr_traffic.py プロジェクト: rprz/reddit
 def __init__(self, input_path, output_path):
     """Configure this Pig step with an input and an output location.

     Both paths are kept on the instance. The step name is built from the
     class-level ``STEP_NAME`` and the input path, and the paths are
     forwarded to ``PigStep.__init__`` as ``-p INPUT=...`` and
     ``-p OUTPUT=...`` arguments for the script named by ``PIG_FILE``.

     :param input_path: value for the ``INPUT`` script parameter.
     :param output_path: value for the ``OUTPUT`` script parameter.
     """
     self.input_path = input_path
     self.output_path = output_path
     self.name = "%s (%s)" % (self.STEP_NAME, self.input_path)

     # Each script parameter is a '-p' flag followed by 'NAME=value'.
     pig_args = []
     pig_args.extend(["-p", "INPUT=%s" % self.input_path])
     pig_args.extend(["-p", "OUTPUT=%s" % self.output_path])

     PigStep.__init__(self, self.name, self.PIG_FILE, pig_args=pig_args)