def run_hive_jobs(start_dt, end_dt):
    """Run the daily exercise-stats Hive script for each day in a range.

    Arguments:
      start_dt: inclusive start date string, formatted per DATE_FORMAT.
      end_dt: exclusive end date string, formatted per DATE_FORMAT.

    Returns a (jobflow, completion_status) tuple.
    """
    jobname = "Daily Exercise Stats (%s to %s)" % (start_dt, end_dt)
    jobflow = emr.create_hive_cluster(jobname, {})

    # TODO(jace): make sure the required data (ProblemLogs, etc)
    # is available before running these downstream summaries

    first_day = datetime.datetime.strptime(start_dt, DATE_FORMAT)
    one_day = datetime.timedelta(days=1)
    # end_dt is exclusive, so processing starts at end - 1 day and
    # walks backwards until start_dt (inclusive) is passed.
    current = datetime.datetime.strptime(end_dt, DATE_FORMAT) - one_day
    while current >= first_day:
        emr.add_hive_step(jobflow, {},
                          's3://ka-mapreduce/code/hive/daily_ex_stats.q',
                          script_args={
                              "dt": current.strftime(DATE_FORMAT),
                              "branch": "/"
                          })
        current -= one_day

    return jobflow, emr.wait_for_completion(jobflow)
# Example #2 (score: 0)
def run_hive_jobs(start_dt, end_dt):
    """Launch an EMR Hive cluster running daily_ex_stats.q once per day.

    start_dt is inclusive and end_dt exclusive; both are DATE_FORMAT
    strings.  Returns the jobflow along with the result of waiting for
    it to finish.
    """
    jobname = "Daily Exercise Stats (%s to %s)" % (start_dt, end_dt)
    jobflow = emr.create_hive_cluster(jobname, {})

    # TODO(jace): make sure the required data (ProblemLogs, etc)
    # is available before running these downstream summaries

    start = datetime.datetime.strptime(start_dt, DATE_FORMAT)
    end = datetime.datetime.strptime(end_dt, DATE_FORMAT)
    num_days = (end - start).days
    # Walk backwards from the day before end_dt (exclusive bound)
    # down to start_dt itself.
    for offset in range(num_days - 1, -1, -1):
        day = start + datetime.timedelta(days=offset)
        emr.add_hive_step(
            jobflow,
            {},
            "s3://ka-mapreduce/code/hive/daily_ex_stats.q",
            script_args={"dt": day.strftime(DATE_FORMAT), "branch": "/"},
        )

    return jobflow, emr.wait_for_completion(jobflow)
def run_hive_jobs(jobname, steps, num_instances):
    """Run hive steps.

    Arguments:
      jobname: Name for the Amazon EMR job.
      steps: A sequence of dictionaries describing the job steps to add.
        Each step may specify the keys "hive_script" and "hive_args". If
        "hive_script" is missing, no job step will be added. These steps
        usually come directly from a configuration file.
      num_instances: The number of instances to run this job on. Equivalent
        to the EMR CLI option --num-instances.

    Calls sys.exit() when a job does not complete successfully.
    """
    jobflow = emr.create_hive_cluster(
            jobname, {"num_instances": num_instances})
    for step in steps:
        # It's possible to leave out hive_script and hive_args, for
        # when the step just wants to move data from hive into mongo,
        # and not run any hive script.
        if 'hive_script' not in step:
            continue
        # A step may have a script but no args; default "hive_args" to {}
        # instead of raising KeyError on a missing key.
        emr.add_hive_step(jobflow, {},
                          hive_script=step["hive_script"],
                          script_args=step.get("hive_args", {}))

    status = emr.wait_for_completion(jobflow, logger=g_logger)
    listing = emr.list_steps(jobflow)
    failures = ["FAILED", "CANCELLED", "TERMINATED"]
    if any(s in listing for s in failures):
        subject = "Reporting jobflow FAILED: %s" % jobname
        notify.send_email(subject, listing)
        notify.send_hipchat(subject)
    else:
        subject = "Reporting jobflow SUCCEEDED: %s" % jobname
        notify.send_email(subject, listing)
    if status != "COMPLETED":
        g_logger.fatal("Hive jobs failed")
        g_logger.fatal(emr.list_steps(jobflow))
        sys.exit(1)
def run_hive_jobs(jobname, steps, num_instances):
    """Run hive steps.

    Arguments:
      jobname: Name for the Amazon EMR job.
      steps: A sequence of dictionaries describing the job steps to add.
        Each step may specify the keys "hive_script" and "hive_args". If
        "hive_script" is missing, no job step will be added. These steps
        usually come directly from a configuration file.
      num_instances: The number of instances to run this job on. Equivalent
        to the EMR CLI option --num-instances.

    Calls sys.exit() when a job does not complete successfully.
    """
    jobflow = emr.create_hive_cluster(
            jobname, {"num_instances": num_instances})

    # Steps without "hive_script" exist only to move data from hive into
    # mongo, so they contribute no EMR job step here.
    for step in (s for s in steps if 'hive_script' in s):
        emr.add_hive_step(jobflow, {},
                          hive_script=step["hive_script"],
                          script_args=step.get("hive_args", {}))

    status = emr.wait_for_completion(jobflow, logger=g_logger)
    listing = emr.list_steps(jobflow)
    had_failure = ("FAILED" in listing
                   or "CANCELLED" in listing
                   or "TERMINATED" in listing)
    if had_failure:
        subject = "Reporting jobflow FAILED: %s" % jobname
        notify.send_email(subject, listing)
        notify.send_hipchat(subject)
    else:
        subject = "Reporting jobflow SUCCEEDED: %s" % jobname
        notify.send_email(subject, listing)
    if status != "COMPLETED":
        g_logger.fatal("Hive jobs failed")
        g_logger.fatal(emr.list_steps(jobflow))
        sys.exit(1)
# Example #5 (score: 0)
def run_hive_jobs(start_dt, end_dt, earliest_dt):
    """Launch the growth-reporting Hive scripts on a fresh EMR cluster.

    Adds three steps -- user_daily_activity, user_growth, and
    company_metrics -- and returns a (jobflow, completion_status) tuple.
    The growth/metrics scripts start from earliest_dt rather than
    start_dt.
    """
    jobname = "Growth Reporting (%s to %s)" % (start_dt, end_dt)
    jobflow = emr.create_hive_cluster(jobname, {})

    # TODO(jace): make sure the required data (ProblemLogs, etc)
    # is available before running these downstream summaries
    steps = [
        ('s3://ka-mapreduce/code/hive/user_daily_activity.q', start_dt),
        ('s3://ka-mapreduce/code/hive/user_growth.q', earliest_dt),
        ('s3://ka-mapreduce/code/hive/company_metrics.q', earliest_dt),
    ]
    for script, first_dt in steps:
        emr.add_hive_step(jobflow, {},
                          hive_script=script,
                          script_args={"start_dt": first_dt,
                                       "end_dt": end_dt})

    return jobflow, emr.wait_for_completion(jobflow)