Example #1
def send_hipchat_deploy_message(
        replaced_files, new_files, spurious_files, dest_path):
    """Send a summary of the deploy information to HipChat."""

    git_version = parse_git_version()
    git_msg = parse_git_message()

    git_version_stamp = git_version
    includes_local_changes = is_git_dirty()
    if includes_local_changes:
        git_version_stamp = "%s+ (including local changes)" % git_version

    github_url = "https://github.com/Khan/analytics/commit/%s" % git_version

    deployer_id = popen_results(['whoami']).strip()

    args = {
        'deployer': deployer_id,
        'num_files': 0,
        'version_stamp': git_version_stamp,
        'git_msg': git_msg,
        'github_url': github_url,
        'dest_path': dest_path,
        'summary': summarize_changes(replaced_files,
                                     new_files,
                                     spurious_files),
    }
    message = ("%(deployer)s just deployed files to S3.<br>" +
               "Destination: %(dest_path)s<br>" +
               "Version: <a href='%(github_url)s'>%(version_stamp)s</a> " +
               "- %(git_msg)s<br>" +
               "%(summary)s") % args

    notify.send_hipchat(message, ["analytics"])
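
The helpers used above (parse_git_version, parse_git_message, is_git_dirty, popen_results, summarize_changes and the notify module itself) are defined elsewhere in the project. A minimal sketch of what the git helpers might look like, assuming they simply shell out to git; these implementations are guesses, not the project's actual code:

import subprocess


def popen_results(args):
    """Run a command and return its stdout as a string (assumed helper)."""
    return subprocess.Popen(args, stdout=subprocess.PIPE).communicate()[0]


def parse_git_version():
    """Return the hash of the current commit."""
    return popen_results(['git', 'rev-parse', 'HEAD']).strip()


def parse_git_message():
    """Return the subject line of the current commit."""
    return popen_results(['git', 'show', '-s', '--format=%s', 'HEAD']).strip()


def is_git_dirty():
    """Return True if the working tree has uncommitted changes."""
    return bool(popen_results(['git', 'status', '--porcelain']).strip())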
Example #3
def monitor_jobflow(jobflow_id):
    status = emr.wait_for_completion(jobflow_id)

    listing = emr.list_steps(jobflow_id)
    jobname = jobflow_id
    heading = listing.split("\n")[0]
    # there just happens to be a fixed number of characters (85) in the
    # output of the 'elastic-mapreduce --list' command before the jobname
    if len(heading) > 85:
        jobname += ": " + heading[85:]

    subject = "Jobflow status = %s (%s)" % (status, jobname)

    failures = ["FAILED", "CANCELLED", "TERMINATED"]
    if any(s in listing for s in failures):
        subject = "STEP FAILED: " + subject
        notify.send_hipchat(subject)

    # Until we get more confident, always send email, even on success
    notify.send_email(subject, listing)
Example #5
def run_hive_jobs(jobname, steps, num_instances):
    """Run hive steps.

    Arguments:
      jobname: Name for the Amazon EMR job.
      steps: A sequence of dictionaries describing the job steps to add.
        Each step may specify the keys "hive_script" and "hive_args". If
        "hive_script" is missing, no job step will be added. These steps
        usually come directly from a configuration file.
      num_instances: The number of instances to run this job on. Equivalent
        to the EMR CLI option --num-instances.

    Calls sys.exit() when a job does not complete successfully.
    """
    jobflow = emr.create_hive_cluster(
            jobname, {"num_instances": num_instances})
    for step in steps:
        # It's possible to leave out hive_script and hive_args, for
        # when the step just wants to move data from hive into mongo,
        # and not run any hive script.
        if 'hive_script' not in step:
            continue
        emr.add_hive_step(jobflow, {},
                          hive_script=step["hive_script"],
                          script_args=step["hive_args"])

    status = emr.wait_for_completion(jobflow, logger=g_logger)
    listing = emr.list_steps(jobflow)
    failures = ["FAILED", "CANCELLED", "TERMINATED"]
    if any(s in listing for s in failures):
        subject = "Reporting jobflow FAILED: %s" % jobname
        notify.send_email(subject, listing)
        notify.send_hipchat(subject)
    else:
        subject = "Reporting jobflow SUCCEEDED: %s" % jobname
        notify.send_email(subject, listing)
    if status != "COMPLETED":
        g_logger.fatal("Hive jobs failed")
        g_logger.fatal(emr.list_steps(jobflow))
        sys.exit(1)
Example #6
def run_hive_jobs(jobname, steps, num_instances):
    """Run hive steps.

    Arguments:
      jobname: Name for the Amazon EMR job.
      steps: A sequence of dictionaries describing the job steps to add.
        Each step may specify the keys "hive_script" and "hive_args". If
        "hive_script" is missing, no job step will be added. These steps
        usually come directly from a configuration file.
      num_instances: The number of instances to run this job on. Equivalent
        to the EMR CLI option --num-instances.

    Calls sys.exit() when a job does not complete successfully.
    """
    jobflow = emr.create_hive_cluster(
            jobname, {"num_instances": num_instances})
    for step in steps:
        # It's possible to leave out hive_script and hive_args, for
        # when the step just wants to move data from hive into mongo,
        # and not run any hive script.
        if 'hive_script' not in step:
            continue
        emr.add_hive_step(jobflow, {},
                          hive_script=step["hive_script"],
                          script_args=step.get("hive_args", {}))

    status = emr.wait_for_completion(jobflow, logger=g_logger)
    listing = emr.list_steps(jobflow)
    failures = ["FAILED", "CANCELLED", "TERMINATED"]
    if any(s in listing for s in failures):
        subject = "Reporting jobflow FAILED: %s" % jobname
        notify.send_email(subject, listing)
        notify.send_hipchat(subject)
    else:
        subject = "Reporting jobflow SUCCEEDED: %s" % jobname
        notify.send_email(subject, listing)
    if status != "COMPLETED":
        g_logger.fatal("Hive jobs failed")
        g_logger.fatal(emr.list_steps(jobflow))
        sys.exit(1)
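
Both run_hive_jobs variants above expect steps to be a sequence of dictionaries, usually read straight from a configuration file. A minimal sketch of such a call; the S3 path, argument names, job name and instance count are made up for illustration:

steps = [
    {
        # A normal Hive step.
        "hive_script": "s3://my-bucket/hive/daily_report.q",
        "hive_args": {"dt": "2013-01-01"},
    },
    {
        # A step without "hive_script" is skipped by run_hive_jobs; such
        # steps only move data from Hive into Mongo elsewhere in the
        # pipeline.  The "step_name" key is purely illustrative.
        "step_name": "load_into_mongo",
    },
]

run_hive_jobs("daily_report", steps, num_instances=4)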
Example #7
def monitor(config, processes):
    """Monitor the concurrent processes"""
    remaining = []
    now = time.time()
    for (process, params) in processes:
        if process.is_alive():
            if (now - params['start']) > int(config["sub_process_time_out"]):
                # timed out: kill the subprocess and alert
                process.terminate()
                # NOTE: although the process gets terminated, its duration
                # should be re-scheduled by the upcoming control-db
                # implementation.
                msg = (("Process hung with kind: %s" +
                        " start_dt: %s end_dt: %s" +
                        " after %s seconds") % (
                        params["kind"], params["start_dt"],
                        params["end_dt"], config["sub_process_time_out"]))
                g_logger.error(msg)
                notify.send_hipchat(msg)
                notify.send_email("WARNING: gae subprocess hung", msg)
            else:
                remaining.append((process, params))
    # Hand the still-running processes back to the caller; rebinding the
    # local name `processes` alone would have no effect outside this function.
    return remaining
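
monitor() expects config to provide "sub_process_time_out" (in seconds) and processes to be a list of (process, params) pairs whose params carry at least "start", "kind", "start_dt" and "end_dt". A sketch of how a caller might build and poll such a list; the fetch_data worker, the kind names, the dates and the polling interval are all assumptions:

import multiprocessing
import time


def fetch_data(kind, start_dt, end_dt):
    """Hypothetical worker; the real download logic lives elsewhere."""
    pass


config = {"sub_process_time_out": 3600}
processes = []
for kind in ["ProblemLog", "VideoLog"]:
    params = {"kind": kind,
              "start_dt": "2013-01-01 00:00:00",
              "end_dt": "2013-01-02 00:00:00",
              "start": time.time()}
    process = multiprocessing.Process(
        target=fetch_data,
        args=(kind, params["start_dt"], params["end_dt"]))
    process.start()
    processes.append((process, params))

# Poll until every subprocess has finished or been terminated.
while processes:
    processes = monitor(config, processes)
    time.sleep(10)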
Example #8
def download_entities(kind,
                      is_ndb,
                      start_dt, end_dt,
                      fetch_interval_seconds,
                      max_entities_per_fetch,
                      max_attempts_per_fetch,
                      index_name,
                      verbose=True):
    """Downloads all entities between start_dt and end_dt  by
    repeatedly calling attempt_fetch_entities if necessary.  Multiple calls
    are only necessary if there are more entities in the time interval
    than max_entities_per_fecth.

    WARNING: because the API call returns entities in [start_dt, end_dt),
    this, function may return some duplicates in its result.  The caller should
    de-dupe by .key() of the entities if needed.

    Returns a list of Entities in protocol buffer format.
    """

    entity_list = []
    interval_start = start_dt
    time_delta = dt.timedelta(seconds=fetch_interval_seconds)
    while interval_start < end_dt:
        interval_end = min(interval_start + time_delta, end_dt)
        response = attempt_fetch_entities(kind,
                                          is_ndb,
                                          interval_start, interval_end,
                                          max_entities_per_fetch,
                                          max_attempts_per_fetch,
                                          index_name,
                                          verbose)
        response_list = pickle.loads(response)
        entity_list += response_list

        if len(response_list) == max_entities_per_fetch:
            # if we maxed out the number of entities for the fetch, there
            # might still be more so query again from the last timestamp
            # WARNING: this depends on the implementation of the API call
            # returning the protobuffs in sorted order
            #
            # This works for both db and ndb models. To convert protobufs to
            # ndb models you'd need to import the model and use a ndb
            # ModelAdapter. But here we just need access to the
            # backup_timestamp property (index_name), so deserializing the
            # protobuf into the lower-level Entity will suffice.
            pb_first, pb_last = response_list[0], response_list[-1]
            entity_first = datastore.Entity._FromPb(
                entity_pb.EntityProto(pb_first))
            entity_last = datastore.Entity._FromPb(
                entity_pb.EntityProto(pb_last))
            timestamp_first = entity_first.get(index_name, None)
            timestamp_last = entity_last.get(index_name, None)

            if (not timestamp_first or not timestamp_last or
                    timestamp_first == timestamp_last):
                # TODO(sitan): There is a possibility that the number of 
                # entities with the exact same ISO 8601 timestamp exceeds the 
                # number allowed by max_logs, in which case if we were to query
                # again, we'd get the same entities back and never update 
                # interval_start. The necessary and sufficient condition for
                # this is that the ISO 8601 timestamps of the first and last 
                # entity retrieved are the same. In such a case, raise an
                # error. Ideally, we'd want to use a query cursor to fix this,
                # but we'd have to change the api call to protobuf-query
                # because protobuf doesn't return a query cursor.
                msg = (("Number of entities of kind %s with timestamp %s " +
                        "in range (%s,%s) exceeded max_logs = %s, " +
                        "pickle download failed") % (
                        kind, timestamp_last, start_dt,
                        end_dt, max_entities_per_fetch))
                subject = "Failed to fetch entity, too many matching timestamps"
                g_logger.error(msg)
                notify.send_hipchat(msg)
                notify.send_email(subject, msg)
                return []
            else:
                interval_start = timestamp_last
        else:
            interval_start = interval_end
    return entity_list
Example #9
def download_entities(kind,
                      is_ndb,
                      start_dt,
                      end_dt,
                      fetch_interval_seconds,
                      max_entities_per_fetch,
                      max_attempts_per_fetch,
                      index_name,
                      verbose=True):
    """Downloads all entities between start_dt and end_dt  by
    repeatedly calling attempt_fetch_entities if necessary.  Multiple calls
    are only necessary if there are more entities in the time interval
    than max_entities_per_fecth.

    WARNING: because the API call returns entities in [start_dt, end_dt),
    this, function may return some duplicates in its result.  The caller should
    de-dupe by .key() of the entities if needed.

    Returns a list of Entities in protocol buffer format.
    """

    entity_list = []
    interval_start = start_dt
    time_delta = dt.timedelta(seconds=fetch_interval_seconds)
    while interval_start < end_dt:
        interval_end = min(interval_start + time_delta, end_dt)
        response = attempt_fetch_entities(kind, is_ndb, interval_start,
                                          interval_end, max_entities_per_fetch,
                                          max_attempts_per_fetch, index_name,
                                          verbose)
        response_list = pickle.loads(response)
        entity_list += response_list

        if len(response_list) == max_entities_per_fetch:
            # if we maxed out the number of entities for the fetch, there
            # might still be more so query again from the last timestamp
            # WARNING: this depends on the implementation of the API call
            # returning the protobuffs in sorted order
            #
            # This works for both db and ndb models. To convert protobufs to
            # ndb models you'd need to import the model and use a ndb
            # ModelAdapter. But here we just need access to the
            # backup_timestamp property (index_name), so deserializing the
            # protobuf into the lower-level Entity will suffice.
            pb_first, pb_last = response_list[0], response_list[-1]
            entity_first = datastore.Entity._FromPb(
                entity_pb.EntityProto(pb_first))
            entity_last = datastore.Entity._FromPb(
                entity_pb.EntityProto(pb_last))
            timestamp_first = entity_first.get(index_name, None)
            timestamp_last = entity_last.get(index_name, None)

            if (not timestamp_first or not timestamp_last
                    or timestamp_first == timestamp_last):
                # TODO(sitan): There is a possibility that the number of
                # entities with the exact same ISO 8601 timestamp exceeds the
                # number allowed by max_logs, in which case if we were to query
                # again, we'd get the same entities back and never update
                # interval_start. The necessary and sufficient condition for
                # this is that the ISO 8601 timestamps of the first and last
                # entity retrieved are the same. In such a case, raise an
                # error. Ideally, we'd want to use a query cursor to fix this,
                # but we'd have to change the api call to protobuf-query
                # because protobuf doesn't return a query cursor.
                msg = (("Number of entities of kind %s with timestamp %s " +
                        "in range (%s,%s) exceeded max_logs = %s, " +
                        "pickle download failed") %
                       (kind, timestamp_last, start_dt, end_dt,
                        max_entities_per_fetch))
                subject = "Failed to fetch entity, too many matching timestamps"
                g_logger.error(msg)
                notify.send_hipchat(msg)
                notify.send_email(subject, msg)
                return []
            else:
                interval_start = timestamp_last
        else:
            interval_start = interval_end
    return entity_list
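
When a fetch maxes out, the next window restarts at the last timestamp seen, so the result may contain duplicates, as the docstring warns. A sketch of de-duping the returned protobufs by entity key, reusing the same low-level deserialization the function itself uses; dedupe_entities is a hypothetical helper and the import paths assume the usual App Engine SDK layout:

from google.appengine.api import datastore
from google.appengine.datastore import entity_pb


def dedupe_entities(pb_list):
    """Drop protobuf entities whose datastore key has already been seen."""
    seen = set()
    unique = []
    for pb in pb_list:
        entity = datastore.Entity._FromPb(entity_pb.EntityProto(pb))
        key = str(entity.key())
        if key not in seen:
            seen.add(key)
            unique.append(pb)
    return unique

# entity_list = dedupe_entities(download_entities(...))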
Example #10
# set the usage alert limit.  default to 90%
threshold = int(sys.argv[2]) if len(sys.argv) == 3 else 90

# run 'df -h' and capture the output lines
df_output = subprocess.check_output(["df", "-h"])

lines = df_output.split("\n")
# filter for filesystems we care about
lines = [line for line in lines if line.startswith("/")]

warn = False
for line in lines:
    print line
    # grab a string percentage of usage, e.g., '78%'
    use_pct = line.split()[4]
    # convert to a number
    use_pct = int(use_pct[:-1])
    if use_pct > threshold:
        warn = True
        break

if warn:
    message = ("WARNING: disk space low on machine '%s'. "
               "Try running archive_to_s3.py" % hostname)
    print >> sys.stderr, message
    print >> sys.stderr, df_output

    notify.send_hipchat(message)
    notify.send_email("WARNING: low disk space", message + "\n\n" + df_output)