import datetime
import sys

# Assumed to be provided elsewhere in this module: the emr and notify
# helper modules, the g_logger logger, and the DATE_FORMAT string.


def run_hive_jobs(start_dt, end_dt):
    jobname = "Daily Exercise Stats (%s to %s)" % (start_dt, end_dt)
    jobflow = emr.create_hive_cluster(jobname, {})

    # TODO(jace): make sure the required data (ProblemLogs, etc)
    # is available before running these downstream summaries
    day = datetime.datetime.strptime(end_dt, DATE_FORMAT)
    start_day = datetime.datetime.strptime(start_dt, DATE_FORMAT)
    day -= datetime.timedelta(days=1)  # end date is exclusive

    # Add one step per day, walking backward from the last included day.
    while day >= start_day:
        emr.add_hive_step(jobflow, {},
                          hive_script='s3://ka-mapreduce/code/hive/daily_ex_stats.q',
                          script_args={"dt": day.strftime(DATE_FORMAT),
                                       "branch": "/"})
        day -= datetime.timedelta(days=1)

    return jobflow, emr.wait_for_completion(jobflow)
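# A hypothetical invocation, assuming DATE_FORMAT is "%Y-%m-%d": this
# would queue daily_ex_stats.q once each for 2012-06-01 through
# 2012-06-07, since end_dt is exclusive.
#
#   jobflow, status = run_hive_jobs("2012-06-01", "2012-06-08")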
def run_hive_jobs(jobname, steps, num_instances):
    """Run hive steps.

    Arguments:
      jobname: Name for the Amazon EMR job.
      steps: A sequence of dictionaries describing the job steps to
          add. Each step may specify the keys "hive_script" and
          "hive_args". If "hive_script" is missing, no job step will
          be added. These steps usually come directly from a
          configuration file.
      num_instances: The number of instances to run this job on.
          Equivalent to the EMR CLI option --num-instances.

    Calls sys.exit() when a job does not complete successfully.
    """
    jobflow = emr.create_hive_cluster(
        jobname, {"num_instances": num_instances})

    for step in steps:
        # It's possible to leave out hive_script and hive_args, for
        # when the step just wants to move data from hive into mongo,
        # and not run any hive script.
        if 'hive_script' not in step:
            continue
        emr.add_hive_step(jobflow, {},
                          hive_script=step["hive_script"],
                          script_args=step.get("hive_args", {}))

    status = emr.wait_for_completion(jobflow, logger=g_logger)
    listing = emr.list_steps(jobflow)

    failures = ["FAILED", "CANCELLED", "TERMINATED"]
    if any(s in listing for s in failures):
        subject = "Reporting jobflow FAILED: %s" % jobname
        notify.send_email(subject, listing)
        notify.send_hipchat(subject)
    else:
        subject = "Reporting jobflow SUCCEEDED: %s" % jobname
        notify.send_email(subject, listing)

    if status != "COMPLETED":
        g_logger.fatal("Hive jobs failed")
        g_logger.fatal(listing)
        sys.exit(1)
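# A hypothetical steps structure this runner accepts, e.g. as loaded
# from a configuration file. The second step relies on "hive_args"
# defaulting to {}, and the third has no "hive_script", so it adds no
# EMR step at all.
#
#   steps = [
#       {"hive_script": "s3://ka-mapreduce/code/hive/user_daily_activity.q",
#        "hive_args": {"start_dt": "2012-06-01", "end_dt": "2012-06-08"}},
#       {"hive_script": "s3://ka-mapreduce/code/hive/company_metrics.q"},
#       {"mongo_import": "user_growth"},  # hive-to-mongo only; skipped here
#   ]
#   run_hive_jobs("Growth Reporting", steps, num_instances=4)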
def run_hive_jobs(start_dt, end_dt, earliest_dt):
    jobname = "Growth Reporting (%s to %s)" % (start_dt, end_dt)
    jobflow = emr.create_hive_cluster(jobname, {})

    # TODO(jace): make sure the required data (ProblemLogs, etc)
    # is available before running these downstream summaries
    emr.add_hive_step(jobflow, {},
                      hive_script='s3://ka-mapreduce/code/hive/user_daily_activity.q',
                      script_args={"start_dt": start_dt, "end_dt": end_dt})
    emr.add_hive_step(jobflow, {},
                      hive_script='s3://ka-mapreduce/code/hive/user_growth.q',
                      script_args={"start_dt": earliest_dt, "end_dt": end_dt})
    emr.add_hive_step(jobflow, {},
                      hive_script='s3://ka-mapreduce/code/hive/company_metrics.q',
                      script_args={"start_dt": earliest_dt, "end_dt": end_dt})

    return jobflow, emr.wait_for_completion(jobflow)
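# A hypothetical invocation: user_daily_activity.q summarizes only the
# requested window, while the cumulative user_growth.q and
# company_metrics.q scripts are recomputed from the earliest date on
# record.
#
#   jobflow, status = run_hive_jobs(
#       start_dt="2012-06-01", end_dt="2012-06-08",
#       earliest_dt="2011-01-01")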