Esempio n. 1
def main():
    """Run the hive jobs for a date range, then import the resulting report.

    The range is taken from the -b/--begindate and -e/--enddate options
    when both are given.  Otherwise it defaults to yesterday through today
    (both at midnight), formatted with DATE_FORMAT.

    When the jobflow does not end with status "COMPLETED", the step listing
    is written to stderr and the process exits with status 1 without
    running the report importer.
    """
    parser = optparse.OptionParser()
    parser.add_option("-b", "--begindate", help="In format YYYY-MM-DD.")
    parser.add_option("-e", "--enddate", help="In format YYYY-MM-DD.")
    options, dummy = parser.parse_args()

    # datetime.time() is 00:00:00, so this is midnight at the start of today.
    today = datetime.datetime.combine(datetime.date.today(), datetime.time())
    yesterday = today - datetime.timedelta(days=1)

    if options.begindate and options.enddate:
        start_date = options.begindate
        end_date = options.enddate
    else:
        # Only fall back to the default window when no explicit range
        # was supplied; otherwise the options would be ignored.
        start_date = yesterday.strftime(DATE_FORMAT)
        end_date = today.strftime(DATE_FORMAT)

    jobflow, status = run_hive_jobs(start_date, end_date)

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("Jobflow %s ended with status %s." % (jobflow, status))
    if status != "COMPLETED":
        # if cronned, this will get sent as email
        sys.stderr.write("%s\n" % (emr.list_steps(jobflow),))
        # Do not import data from a jobflow that did not complete.
        sys.exit(1)

    # Jobflow was successful, so transfer the data to
    # the Mongo reporting db used by dashboards
    run_report_importer("daily_exercise_stats", "daily_exercise_stats")
Esempio n. 2
def main():
    """Run the hive jobs for a date range, then import the resulting report.

    The range is taken from the -b/--begindate and -e/--enddate options
    when both are given.  Otherwise it defaults to yesterday through today
    (both at midnight), formatted with DATE_FORMAT.

    When the jobflow does not end with status "COMPLETED", the step listing
    is written to stderr and the process exits with status 1 without
    running the report importer.
    """
    parser = optparse.OptionParser()
    parser.add_option("-b", "--begindate", help="In format YYYY-MM-DD.")
    parser.add_option("-e", "--enddate", help="In format YYYY-MM-DD.")
    options, dummy = parser.parse_args()

    # datetime.time() is 00:00:00, so this is midnight at the start of today.
    today = datetime.datetime.combine(datetime.date.today(), datetime.time())
    yesterday = today - datetime.timedelta(days=1)

    if options.begindate and options.enddate:
        start_date = options.begindate
        end_date = options.enddate
    else:
        # Only fall back to the default window when no explicit range
        # was supplied; otherwise the options would be ignored.
        start_date = yesterday.strftime(DATE_FORMAT)
        end_date = today.strftime(DATE_FORMAT)

    jobflow, status = run_hive_jobs(start_date, end_date)

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("Jobflow %s ended with status %s." % (jobflow, status))
    if status != "COMPLETED":
        # if cronned, this will get sent as email
        sys.stderr.write("%s\n" % (emr.list_steps(jobflow),))
        # Do not import data from a jobflow that did not complete.
        sys.exit(1)

    # Jobflow was successful, so transfer the data to
    # the Mongo reporting db used by dashboards
    run_report_importer("daily_exercise_stats", "daily_exercise_stats")
Esempio n. 3
def main():
    """Run the hive jobs for a date range, then import the growth reports.

    The range is taken from the -b/--begindate and -e/--enddate options
    when both are given.  Otherwise it defaults to yesterday through today
    (both at midnight), formatted as YYYY-MM-DD.  A fixed earliest date of
    2011-01-01 is passed to run_hive_jobs alongside the range.

    When the jobflow does not end with status "COMPLETED", the step listing
    is written to stderr and the process exits with status 1 without
    running the report importers.
    """
    parser = optparse.OptionParser()
    parser.add_option("-b", "--begindate", help="In format YYYY-MM-DD.")
    parser.add_option("-e", "--enddate", help="In format YYYY-MM-DD.")
    options, dummy = parser.parse_args()

    # datetime.time() is 00:00:00, so this is midnight at the start of today.
    today = datetime.datetime.combine(datetime.date.today(), datetime.time())
    yesterday = today - datetime.timedelta(days=1)

    if options.begindate and options.enddate:
        start_date = options.begindate
        end_date = options.enddate
    else:
        # Only fall back to the default window when no explicit range
        # was supplied; otherwise the options would be ignored.
        start_date = yesterday.strftime("%Y-%m-%d")
        end_date = today.strftime("%Y-%m-%d")
    earliest_date = "2011-01-01"  # farthest back we've ever populated

    jobflow, status = run_hive_jobs(start_date, end_date, earliest_date)

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("Jobflow %s ended with status %s." % (jobflow, status))
    if status != "COMPLETED":
        # if cronned, this will get sent as email
        sys.stderr.write("%s\n" % (emr.list_steps(jobflow),))
        # Do not import data from a jobflow that did not complete.
        sys.exit(1)

    # Jobflow was successful, so transfer the data to
    # the Mongo reporting db used by dashboards
    run_report_importer("user_growth", "user_growth")
    run_report_importer("company_metrics", "company_metrics")
Esempio n. 4
def run_hive_jobs(jobname, steps, num_instances):
    """Run hive steps on a new EMR hive cluster and email the outcome.

    Arguments:
      jobname: Name for the Amazon EMR job.
      steps: A sequence of dictionaries describing the job steps to add.
        Each step may specify the keys "hive_script" and "hive_args". If
        "hive_script" is missing, no job step will be added. These steps
        usually come directly from a configuration file.
      num_instances: The number of instances to run this job on. Equivalent
        to the EMR CLI option --num-instances.

    Sends one email whose subject says FAILED or SUCCEEDED depending on
    whether the step listing mentions a failure state.

    Calls sys.exit() when a job does not complete successfully.
    """
    import sys  # local import: only needed for the failure exit below

    jobflow = emr.create_hive_cluster(
            jobname, {"num_instances": num_instances})
    for step in steps:
        # It's possible to leave out hive_script and hive_args, for
        # when the step just wants to move data from hive into mongo,
        # and not run any hive script.
        if 'hive_script' not in step:
            continue
        # NOTE(review): the hive_script keyword name is reconstructed from
        # the docstring -- confirm against emr.add_hive_step's signature.
        emr.add_hive_step(jobflow, {},
                          hive_script=step["hive_script"],
                          script_args=step.get("hive_args", {}))

    status = emr.wait_for_completion(jobflow, logger=g_logger)
    listing = emr.list_steps(jobflow)
    failures = ["FAILED", "CANCELLED", "TERMINATED"]
    # Exactly one notification per run: failure and success are exclusive.
    if any(s in listing for s in failures):
        subject = "Reporting jobflow FAILED: %s" % jobname
    else:
        subject = "Reporting jobflow SUCCEEDED: %s" % jobname
    notify.send_email(subject, listing)
    if status != "COMPLETED":
        g_logger.fatal("Hive jobs failed")
        sys.exit(1)
Esempio n. 5
def run_hive_jobs(jobname, steps, num_instances):
    """Run hive steps on a new EMR hive cluster and email the outcome.

    Arguments:
      jobname: Name for the Amazon EMR job.
      steps: A sequence of dictionaries describing the job steps to add.
        Each step may specify the keys "hive_script" and "hive_args". If
        "hive_script" is missing, no job step will be added. These steps
        usually come directly from a configuration file.
      num_instances: The number of instances to run this job on. Equivalent
        to the EMR CLI option --num-instances.

    Sends one email whose subject says FAILED or SUCCEEDED depending on
    whether the step listing mentions a failure state.

    Calls sys.exit() when a job does not complete successfully.
    """
    import sys  # local import: only needed for the failure exit below

    jobflow = emr.create_hive_cluster(
            jobname, {"num_instances": num_instances})
    for step in steps:
        # It's possible to leave out hive_script and hive_args, for
        # when the step just wants to move data from hive into mongo,
        # and not run any hive script.
        if 'hive_script' not in step:
            continue
        # NOTE(review): the hive_script keyword name is reconstructed from
        # the docstring -- confirm against emr.add_hive_step's signature.
        emr.add_hive_step(jobflow, {},
                          hive_script=step["hive_script"],
                          script_args=step.get("hive_args", {}))

    status = emr.wait_for_completion(jobflow, logger=g_logger)
    listing = emr.list_steps(jobflow)
    failures = ["FAILED", "CANCELLED", "TERMINATED"]
    # Exactly one notification per run: failure and success are exclusive.
    if any(s in listing for s in failures):
        subject = "Reporting jobflow FAILED: %s" % jobname
    else:
        subject = "Reporting jobflow SUCCEEDED: %s" % jobname
    notify.send_email(subject, listing)
    if status != "COMPLETED":
        g_logger.fatal("Hive jobs failed")
        sys.exit(1)
Esempio n. 6
def monitor_jobflow(jobflow_id):
    """Wait for a jobflow to finish, then email its step listing.

    The email subject carries the final status and a job label, and is
    prefixed with "STEP FAILED: " when the listing mentions a failure
    state.  An email is sent whether or not anything failed.
    """
    final_status = emr.wait_for_completion(jobflow_id)
    step_listing = emr.list_steps(jobflow_id)

    # The job name follows a fixed-width prefix on the first line: there
    # just happen to be 85 characters before it in the output of the
    # 'elastic-mapreduce --list' command.
    first_line = step_listing.split("\n")[0]
    if len(first_line) > 85:
        label = jobflow_id + ": " + first_line[85:]
    else:
        label = jobflow_id

    subject = "Jobflow status = %s (%s)" % (final_status, label)

    for marker in ("FAILED", "CANCELLED", "TERMINATED"):
        if marker in step_listing:
            subject = "STEP FAILED: " + subject
            break

    # Until we get more confident, always send email, even on success
    notify.send_email(subject, step_listing)
def monitor_jobflow(jobflow_id):
    """Wait for a jobflow to finish, then email its step listing.

    The email subject carries the final status and a job label, and is
    prefixed with "STEP FAILED: " when the listing mentions a failure
    state.  An email is sent whether or not anything failed.
    """
    final_status = emr.wait_for_completion(jobflow_id)
    step_listing = emr.list_steps(jobflow_id)

    # The job name follows a fixed-width prefix on the first line: there
    # just happen to be 85 characters before it in the output of the
    # 'elastic-mapreduce --list' command.
    first_line = step_listing.split("\n")[0]
    if len(first_line) > 85:
        label = jobflow_id + ": " + first_line[85:]
    else:
        label = jobflow_id

    subject = "Jobflow status = %s (%s)" % (final_status, label)

    for marker in ("FAILED", "CANCELLED", "TERMINATED"):
        if marker in step_listing:
            subject = "STEP FAILED: " + subject
            break

    # Until we get more confident, always send email, even on success
    notify.send_email(subject, step_listing)