def send_hipchat_deploy_message(
        replaced_files, new_files, spurious_files, dest_path):
    """Send a summary of the deploy information to HipChat.

    Arguments:
        replaced_files: files overwritten by this deploy (passed through
            to summarize_changes).
        new_files: files newly added by this deploy (passed through to
            summarize_changes).
        spurious_files: files present at the destination but not part of
            this deploy (passed through to summarize_changes).
        dest_path: the S3 destination path, included in the message.
    """
    git_version = parse_git_version()
    git_msg = parse_git_message()

    git_version_stamp = git_version
    if is_git_dirty():
        # Flag deploys that include uncommitted local changes so the
        # version stamp is not mistaken for a clean commit.
        git_version_stamp = "%s+ (including local changes)" % git_version

    github_url = "https://github.com/Khan/analytics/commit/%s" % git_version
    deployer_id = popen_results(['whoami']).strip()

    # NOTE: the old code also set an unused 'num_files': 0 entry here;
    # the message template never referenced it, so it has been dropped.
    args = {
        'deployer': deployer_id,
        'version_stamp': git_version_stamp,
        'git_msg': git_msg,
        'github_url': github_url,
        'dest_path': dest_path,
        'summary': summarize_changes(
            replaced_files, new_files, spurious_files),
    }

    message = ("%(deployer)s just deployed files to S3.<br>" +
               "Destination: %(dest_path)s<br>" +
               "Version: <a href='%(github_url)s'>%(version_stamp)s</a> " +
               "- %(git_msg)s<br>" +
               "%(summary)s") % args

    notify.send_hipchat(message, ["analytics"])
def monitor_jobflow(jobflow_id):
    """Block until the jobflow finishes, then report its outcome.

    Sends the status line to HipChat, and always emails the full step
    listing as well.
    """
    status = emr.wait_for_completion(jobflow_id)
    listing = emr.list_steps(jobflow_id)

    # there just happens to be a fixed number of characters (85) in the
    # output of the 'elastic-mapreduce --list' command before the jobname
    first_line = listing.split("\n")[0]
    jobname = jobflow_id
    if len(first_line) > 85:
        jobname = jobflow_id + ": " + first_line[85:]

    subject = "Jobflow status = %s (%s)" % (status, jobname)
    had_failure = any(
        marker in listing
        for marker in ("FAILED", "CANCELLED", "TERMINATED"))
    if had_failure:
        subject = "STEP FAILED: " + subject

    notify.send_hipchat(subject)
    # Until we get more confident, always send email, even on success
    notify.send_email(subject, listing)
def run_hive_jobs(jobname, steps, num_instances):
    """Run hive steps.

    Arguments:
      jobname: Name for the Amazon EMR job.
      steps: A sequence of dictionaries describing the job steps to add.
          Each step may specify the keys "hive_script" and "hive_args".
          If "hive_script" is missing, no job step will be added.  If
          "hive_args" is missing, the script is run with no extra
          arguments.  These steps usually come directly from a
          configuration file.
      num_instances: The number of instances to run this job on.
          Equivalent to the EMR CLI option --num-instances.

    Calls sys.exit() when a job does not complete successfully.
    """
    jobflow = emr.create_hive_cluster(
        jobname, {"num_instances": num_instances})
    for step in steps:
        # It's possible to leave out hive_script and hive_args, for
        # when the step just wants to move data from hive into mongo,
        # and not run any hive script.
        if 'hive_script' not in step:
            continue
        # Bug fix: use .get() so a step that has a hive_script but omits
        # hive_args (allowed per the docstring) doesn't raise KeyError.
        emr.add_hive_step(jobflow, {},
                          hive_script=step["hive_script"],
                          script_args=step.get("hive_args", {}))

    status = emr.wait_for_completion(jobflow, logger=g_logger)
    listing = emr.list_steps(jobflow)
    failures = ["FAILED", "CANCELLED", "TERMINATED"]
    if any(s in listing for s in failures):
        subject = "Reporting jobflow FAILED: %s" % jobname
        notify.send_email(subject, listing)
        # Only ping HipChat on failure; success is email-only.
        notify.send_hipchat(subject)
    else:
        subject = "Reporting jobflow SUCCEEDED: %s" % jobname
        notify.send_email(subject, listing)

    if status != "COMPLETED":
        g_logger.fatal("Hive jobs failed")
        g_logger.fatal(emr.list_steps(jobflow))
        sys.exit(1)
def run_hive_jobs(jobname, steps, num_instances):
    """Run hive steps.

    Arguments:
      jobname: Name for the Amazon EMR job.
      steps: A sequence of dictionaries describing the job steps to add.
          Each step may specify the keys "hive_script" and "hive_args".
          If "hive_script" is missing, no job step will be added.
          These steps usually come directly from a configuration file.
      num_instances: The number of instances to run this job on.
          Equivalent to the EMR CLI option --num-instances.

    Calls sys.exit() when a job does not complete successfully.
    """
    jobflow = emr.create_hive_cluster(
        jobname, {"num_instances": num_instances})

    # Steps without a 'hive_script' exist only to move data around
    # (e.g. hive -> mongo) and contribute no EMR job step.
    for step in (s for s in steps if 'hive_script' in s):
        emr.add_hive_step(
            jobflow, {},
            hive_script=step["hive_script"],
            script_args=step.get("hive_args", {}))

    status = emr.wait_for_completion(jobflow, logger=g_logger)
    listing = emr.list_steps(jobflow)

    failed = any(
        state in listing
        for state in ("FAILED", "CANCELLED", "TERMINATED"))
    if failed:
        subject = "Reporting jobflow FAILED: %s" % jobname
        notify.send_email(subject, listing)
        notify.send_hipchat(subject)
    else:
        subject = "Reporting jobflow SUCCEEDED: %s" % jobname
        notify.send_email(subject, listing)

    if status != "COMPLETED":
        g_logger.fatal("Hive jobs failed")
        g_logger.fatal(emr.list_steps(jobflow))
        sys.exit(1)
def monitor(config, processes):
    """Reap timed-out subprocesses and return those still running.

    Arguments:
        config: dict-like; must contain "sub_process_time_out", the
            per-process timeout in seconds (int or numeric string).
        processes: list of (process, params) pairs, where process
            supports is_alive()/terminate() and params is a dict with at
            least 'start' (epoch seconds), plus 'kind', 'start_dt' and
            'end_dt' used in the hang notification.

    Returns the list of (process, params) pairs that are still alive and
    within the timeout.  Processes that have exited are dropped; hung
    ones are terminated and reported via log, HipChat and email.

    Bug fix: the previous version ended with `processes = remaining`,
    which only rebound the local name — the filtered list never reached
    the caller.  It is now returned explicitly.
    """
    remaining = []
    now = time.time()
    timeout_seconds = int(config["sub_process_time_out"])
    for (process, params) in processes:
        if not process.is_alive():
            # Already exited: nothing to track any more.
            continue
        if (now - params['start']) > timeout_seconds:
            # Hung past the timeout: kill it and alert.
            process.terminate()
            # NOTE: Although it gets terminated, the duration should be
            # re-scheduled with the upcoming control-db implementation.
            msg = (("Process hung with kind: %s" +
                    " start_dt: %s end_dt: %s" +
                    " after %s seconds") % (
                        params["kind"], params["start_dt"],
                        params["end_dt"],
                        config["sub_process_time_out"]))
            g_logger.error(msg)
            notify.send_hipchat(msg)
            notify.send_email("WARNING: gae subprocess hung", msg)
        else:
            remaining.append((process, params))
    return remaining
def download_entities(kind, is_ndb, start_dt, end_dt,
                      fetch_interval_seconds,
                      max_entities_per_fetch,
                      max_attempts_per_fetch,
                      index_name,
                      verbose=True):
    """Downloads all entities between start_dt and end_dt by repeatedly
    calling attempt_fetch_entities if necessary. Multiple calls are only
    necessary if there are more entities in the time interval than
    max_entities_per_fetch.

    Arguments:
        kind: the datastore kind (entity type name) to fetch.
        is_ndb: passed through to attempt_fetch_entities; presumably
            whether the kind is an ndb (vs db) model — confirm there.
        start_dt, end_dt: datetime bounds of the interval to download.
        fetch_interval_seconds: size of each fetch sub-window, in seconds.
        max_entities_per_fetch: max entities one API call may return; a
            full batch triggers a follow-up fetch from the last timestamp.
        max_attempts_per_fetch: passed through to attempt_fetch_entities
            (retry budget per fetch).
        index_name: name of the timestamp property used to page through
            full batches (e.g. a backup_timestamp field).
        verbose: passed through to attempt_fetch_entities.

    WARNING: because the API call returns entities in [start_dt, end_dt),
    this function may return some duplicates in its result. The caller
    should de-dupe by .key() of the entities if needed.

    Returns a list of Entities in protocol buffer format, or [] if a
    window contains more same-timestamp entities than one fetch allows.
    """
    entity_list = []
    interval_start = start_dt
    time_delta = dt.timedelta(seconds=fetch_interval_seconds)
    # Walk the [start_dt, end_dt) range one fetch window at a time.
    while interval_start < end_dt:
        interval_end = min(interval_start + time_delta, end_dt)
        # attempt_fetch_entities returns a pickled list of protobufs.
        response = attempt_fetch_entities(kind, is_ndb,
                                          interval_start, interval_end,
                                          max_entities_per_fetch,
                                          max_attempts_per_fetch,
                                          index_name, verbose)
        response_list = pickle.loads(response)
        entity_list += response_list
        if len(response_list) == max_entities_per_fetch:
            # if we maxed out the number of entities for the fetch, there
            # might still be more so query again from the last timestamp
            # WARNING: this depends on the implementation of the API call
            # returning the protobuffs in sorted order
            #
            # This works for both db and ndb models. To convert protobufs
            # to ndb models you'd need to import the model and use a ndb
            # ModelAdapter. But here we just need access to the
            # backup_timestamp property (index_name), so deserializing the
            # protobuf into the lower-level Entity will suffice.
            pb_first, pb_last = response_list[0], response_list[-1]
            entity_first = datastore.Entity._FromPb(
                entity_pb.EntityProto(pb_first))
            entity_last = datastore.Entity._FromPb(
                entity_pb.EntityProto(pb_last))
            timestamp_first = entity_first.get(index_name, None)
            timestamp_last = entity_last.get(index_name, None)
            if (not timestamp_first or not timestamp_last or
                    timestamp_first == timestamp_last):
                # TODO(sitan): There is a possibility that the number of
                # entities with the exact same ISO 8601 timestamp exceeds
                # the number allowed by max_logs, in which case if we were
                # to query again, we'd get the same entities back and
                # never update interval_start. The necessary and
                # sufficient condition for this is that the ISO 8601
                # timestamps of the first and last entity retrieved are
                # the same. In such a case, raise an error. Ideally, we'd
                # want to use a query cursor to fix this, but we'd have to
                # change the api call to protobuf-query because protobuf
                # doesn't return a query cursor.
                msg = (("Number of entities of kind %s with timestamp %s " +
                        "in range (%s,%s) exceeded max_logs = %s, " +
                        "pickle download failed") % (
                            kind, timestamp_last, start_dt, end_dt,
                            max_entities_per_fetch))
                subject = ("Failed to fetch entity, "
                           "too many matching timestamps")
                g_logger.error(msg)
                notify.send_hipchat(msg)
                notify.send_email(subject, msg)
                return []
            else:
                # Resume the same window from the last seen timestamp.
                interval_start = timestamp_last
        else:
            # Window exhausted; advance to the next one.
            interval_start = interval_end
    return entity_list
def download_entities(kind, is_ndb, start_dt, end_dt,
                      fetch_interval_seconds, max_entities_per_fetch,
                      max_attempts_per_fetch, index_name, verbose=True):
    """Downloads all entities between start_dt and end_dt by repeatedly
    calling attempt_fetch_entities if necessary. Multiple calls are only
    necessary if there are more entities in the time interval than
    max_entities_per_fetch.

    WARNING: because the API call returns entities in [start_dt, end_dt),
    this function may return some duplicates in its result. The caller
    should de-dupe by .key() of the entities if needed.

    Returns a list of Entities in protocol buffer format.
    """
    results = []
    window_start = start_dt
    window_size = dt.timedelta(seconds=fetch_interval_seconds)

    while window_start < end_dt:
        window_end = min(window_start + window_size, end_dt)
        raw = attempt_fetch_entities(kind, is_ndb,
                                     window_start, window_end,
                                     max_entities_per_fetch,
                                     max_attempts_per_fetch,
                                     index_name, verbose)
        batch = pickle.loads(raw)
        results += batch

        if len(batch) != max_entities_per_fetch:
            # The window wasn't full, so it is exhausted: move on.
            window_start = window_end
            continue

        # A full batch means there may be more entities in this window;
        # resume from the timestamp of the last entity returned.
        # WARNING: this depends on the implementation of the API call
        # returning the protobuffs in sorted order.
        #
        # This works for both db and ndb models. To convert protobufs to
        # ndb models you'd need to import the model and use a ndb
        # ModelAdapter. But here we just need access to the
        # backup_timestamp property (index_name), so deserializing the
        # protobuf into the lower-level Entity will suffice.
        first_entity = datastore.Entity._FromPb(
            entity_pb.EntityProto(batch[0]))
        last_entity = datastore.Entity._FromPb(
            entity_pb.EntityProto(batch[-1]))
        first_ts = first_entity.get(index_name, None)
        last_ts = last_entity.get(index_name, None)

        if first_ts and last_ts and first_ts != last_ts:
            window_start = last_ts
            continue

        # TODO(sitan): There is a possibility that the number of
        # entities with the exact same ISO 8601 timestamp exceeds the
        # number allowed by max_logs, in which case if we were to query
        # again, we'd get the same entities back and never update the
        # window start. The necessary and sufficient condition for this
        # is that the ISO 8601 timestamps of the first and last entity
        # retrieved are the same. In such a case, raise an error.
        # Ideally, we'd want to use a query cursor to fix this, but we'd
        # have to change the api call to protobuf-query because protobuf
        # doesn't return a query cursor.
        msg = (("Number of entities of kind %s with timestamp %s " +
                "in range (%s,%s) exceeded max_logs = %s, " +
                "pickle download failed") % (kind, last_ts,
                                             start_dt, end_dt,
                                             max_entities_per_fetch))
        subject = "Failed to fetch entity, too many matching timestamps"
        g_logger.error(msg)
        notify.send_hipchat(msg)
        notify.send_email(subject, msg)
        return []

    return results
# set the usage alert limit. default to 90% threshold = int(sys.argv[2]) if len(sys.argv) == 3 else 90 # run 'df -h' an capture the output lines df_output = subprocess.check_output(["df", "-h"]) lines = df_output.split("\n") # filter for filesystems we care about lines = [line for line in lines if line.startswith("/")] warn = False for line in lines: print line # grab a string percentage of usage, e.g., '78%' use_pct = line.split()[4] # convert to a number use_pct = int(use_pct[:-1]) if use_pct > threshold: warn = True break if warn: message = ("WARNING: disk space low on machine '%s'. " "Try running archive_to_s3.py" % hostname) print >> sys.stderr, message print >> sys.stderr, df_output notify.send_hipchat(message) notify.send_email("WARNING: low disk space", message + "\n\n" + df_output)