def main(args):
    """Search the "message" field of Big Data Store logs for keywords that
    occurred during the run time of a job.

    Looks up the job in the cast-allocation index, then issues a second query
    filtered to the job's time range and compute nodes, aggregating hit counts
    per keyword.

    Returns:
        0 on success, 2 on bad/missing arguments, 3 when the job lookup did
        not return exactly one hit, 4 on an Elasticsearch request error.
    """
    # Specify the arguments.
    parser = argparse.ArgumentParser(
        description='''A tool for finding keywords in the "message" field during the run time of a job.'''
    )
    # type=int so the ID/size values are usable as numbers downstream
    # (argparse would otherwise deliver CLI values as strings).
    parser.add_argument('-a', '--allocationid', metavar='int', dest='allocation_id',
                        default=-1, type=int, help='The allocation ID of the job.')
    parser.add_argument('-j', '--jobid', metavar='int', dest='job_id',
                        default=-1, type=int, help='The job ID of the job.')
    parser.add_argument('-s', '--jobidsecondary', metavar='int', dest='job_id_secondary',
                        default=0, type=int,
                        help='The secondary job ID of the job (default : 0).')
    parser.add_argument(
        '-t', '--target', metavar='hostname:port', dest='target', default=None,
        help='An Elasticsearch server to be queried. This defaults to the contents of environment variable "CAST_ELASTIC".'
    )
    parser.add_argument(
        '-k', '--keywords', metavar='key', dest='keywords', nargs='*', default=['.*'],
        help='A list of keywords to search for in the Big Data Store. Case insensitive regular expressions (default : .*). If your keyword is a phrase (e.g. "xid 13") regular expressions are not supported at this time.'
    )
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Displays any logs that matched the keyword search.')
    parser.add_argument('--size', metavar='size', dest='size', default=30, type=int,
                        help='The number of results to be returned. (default=30)')
    parser.add_argument(
        '-H', '--hostnames', metavar='host', dest='hosts', nargs='*', default=None,
        help='A list of hostnames to filter the results to (filters on the "hostname" field, job independent).'
    )

    args = parser.parse_args()

    # If the target wasn't specified check the environment for the target value, printing help on failure.
    if args.target is None:
        if TARGET_ENV in os.environ:
            args.target = os.environ[TARGET_ENV]
        else:
            parser.print_help()
            print("Missing target, '%s' was not set." % TARGET_ENV)
            return 2

    # Open a connection to the elastic cluster, if this fails is wrong on the server.
    es = Elasticsearch(args.target,
                       sniff_on_start=True,
                       sniff_on_connection_fail=True,
                       sniffer_timeout=60)

    # Execute the query on the cast-allocation index.
    try:
        tr_res = cast.search_job(es, args.allocation_id, args.job_id,
                                 args.job_id_secondary)
    except exceptions.RequestError as e:
        cast.print_request_error(e)
        return 4

    total_hits = cast.deep_get(tr_res, "hits", "total")
    print("Got {0} Hit(s) for specified job, searching for keywords.".format(
        total_hits))

    # Finding no matches with valid search criteria is a legit case; mirror
    # the behavior of the sibling tools and return 0 rather than 3.
    if total_hits is None:
        print("# Sorry. Could not find any matching results.")
        return 0

    if total_hits != 1:
        print(
            "This implementation only supports queries where the hit count is equal to 1."
        )
        return 3

    # TODO make this code more fault tolerant
    hits = cast.deep_get(tr_res, "hits", "hits")
    tr_data = cast.deep_get(hits[0], "_source", "data")

    # ---------------------------------------------------------------------------------------------
    # TODO Add utility script to do this.
    # Build the hostnames string; default to the job's compute nodes.
    if args.hosts is None:
        args.hosts = tr_data.get("compute_nodes")

    hostnames = {
        "multi_match": {
            "query": " ".join(args.hosts),
            "type": "best_fields",
            "fields": ["hostname", "source"],
            "tie_breaker": 0.3,
            "minimum_should_match": 1
        }
    }

    # ---------------------------------------------------------------------------------------------
    # TODO Add a utility script to manage this.
    date_format = '%Y-%m-%d %H:%M:%S.%f'
    search_format = 'epoch_millis'

    # Determine the timerange.
    # NOTE: strftime('%s') (epoch seconds) is a glibc extension — not
    # portable to all platforms.
    start_time = datetime.strptime(tr_data.get("begin_time"), date_format)
    timestamp_range = {
        "gte": start_time.strftime('%s000'),
        "format": search_format
    }

    # If a history is present end_time is end_time, otherwise it's now.
    if "history" in tr_data:
        end_time = datetime.strptime(
            tr_data.get("history").get("end_time"), date_format)
        timestamp_range["lte"] = end_time.strftime('%s999')

    timerange = {"range": {"@timestamp": timestamp_range}}

    # ---------------------------------------------------------------------------------------------
    # Build the message query: one "should" clause per keyword plus a filter
    # aggregation (keyed by the keyword) to count matches per keyword.
    keywords = {}
    should_keywords = []
    for key in args.keywords:
        if key.find(" ") == -1:
            # Single words are treated as case-insensitive regexes.
            should = {"regexp": {"message": key.lower()}}
        else:
            # Phrases fall back to a literal match_phrase (no regex support).
            should = {"match_phrase": {"message": key}}

        should_keywords.append(should)
        keywords[key] = {"filter": should}

    # ---------------------------------------------------------------------------------------------
    # Submit the query.
    body = {
        "query": {
            "bool": {
                "must": [timerange, hostnames, {
                    "exists": {
                        "field": "message"
                    }
                }],
                "should": should_keywords,
                "minimum_should_match": 1
            }
        },
        "sort": ["timestamp"],
        "_source": ["timestamp", "message", "hostname"],
        "size": args.size,
        "aggs": keywords
    }

    try:
        key_res = es.search(index="_all", body=body)
    except exceptions.RequestError as e:
        cast.print_request_error(e)
        return 4

    # Print the count table.
    total = cast.deep_get(key_res, 'hits', 'total')
    print("Got {0} keyword hits:\n".format(total))

    aggregations = key_res.get("aggregations")

    # Pad the keyword column to the longest keyword (minimum 7 chars).
    max_width = 7
    for key in args.keywords:
        max_width = max(max_width, len(key))

    print('{0: >{1}} | Count'.format("Keyword", max_width))
    for agg in aggregations:
        print('{0: >{1}} | {2}'.format(
            agg, max_width, cast.deep_get(aggregations, agg, "doc_count")))
    print(" ")

    # Verbosely print the hits.
    if args.verbose:
        hits = key_res.get('hits', {"hits": []})["hits"]
        print("Displaying {0} of {1} logs:".format(len(hits), total))
        for hit in hits:
            source = hit["_source"]
            print("{0} {1} | {2}".format(source.get("timestamp"),
                                         source.get("hostname"),
                                         source.get("message")))
def main(args):
    """Report metrics (min/max/avg/std-dev and optional correlations) for the
    supplied fields over the nodes participating in a job.

    Looks up the job in the cast-allocation index, then runs an aggregation
    query (extended_stats per field plus matrix_stats) over the job's time
    range and compute nodes.

    Returns:
        0 on success, 2 on bad/missing arguments, 3 when the job lookup did
        not return exactly one hit, 4 on an Elasticsearch request error.
    """
    # Specify the arguments.
    parser = argparse.ArgumentParser(
        description='''A tool for finding metrics about the nodes participating in the supplied job id.'''
    )
    # type=int is required: args.allocation_id is compared with "> 0" below,
    # which would be a str/int comparison (TypeError on Python 3) otherwise.
    parser.add_argument('-a', '--allocationid', metavar='int', dest='allocation_id',
                        default=-1, type=int, help='The allocation ID of the job.')
    parser.add_argument('-j', '--jobid', metavar='int', dest='job_id',
                        default=-1, type=int, help='The job ID of the job.')
    parser.add_argument('-s', '--jobidsecondary', metavar='int', dest='job_id_secondary',
                        default=0, type=int,
                        help='The secondary job ID of the job (default : 0).')
    parser.add_argument(
        '-t', '--target', metavar='hostname:port', dest='target', default=None,
        help='An Elasticsearch server to be queried. This defaults to the contents of environment variable "CAST_ELASTIC".'
    )
    parser.add_argument('-H', '--hostnames', metavar='host', dest='hosts',
                        nargs='*', default=None,
                        help='A list of hostnames to filter the results to.')
    parser.add_argument(
        '-f', '--fields', metavar='field', dest='fields', nargs='*', default=None,
        help='A list of fields to retrieve metrics for (REQUIRED).')
    parser.add_argument('-i', '--index', metavar='index', dest='index',
                        default='_all',
                        help='The index to query for metrics records.')
    parser.add_argument(
        '--correlation', action='store_true',
        help="Displays the correlation between the supplied fields over the job run."
    )

    args = parser.parse_args()

    # If the target wasn't specified check the environment for the target value, printing help on failure.
    if args.target is None:
        if TARGET_ENV in os.environ:
            args.target = os.environ[TARGET_ENV]
        else:
            parser.print_help()
            print("Missing target, '%s' was not set." % TARGET_ENV)
            return 2

    if args.fields is None:
        print("Fields weren't set for metrics analysis.")
        return 2

    # Open a connection to the elastic cluster, if this fails is wrong on the server.
    es = Elasticsearch(args.target,
                       sniff_on_start=True,
                       sniff_on_connection_fail=True,
                       sniffer_timeout=60)

    # Execute the query on the cast-allocation index.
    try:
        tr_res = cast.search_job(es, args.allocation_id, args.job_id,
                                 args.job_id_secondary)
    except exceptions.RequestError as e:
        cast.print_request_error(e)
        return 4

    total_hits = cast.deep_get(tr_res, "hits", "total")
    print("Got {0} Hit(s) for specified job:".format(total_hits))

    # Finding no matches with valid search criteria is a legit case: return 0.
    if total_hits is None:
        print("# Sorry. Could not find any matching results.")
        return 0

    if total_hits != 1:
        print(
            "This implementation only supports queries where the hit count is equal to 1."
        )
        return 3

    hits = cast.deep_get(tr_res, "hits", "hits")
    allocation = cast.deep_get(hits[0], "_source", "data")

    # ---------------------------------------------------------------------------------------------
    # Build the hostnames string; default to the job's compute nodes.
    if args.hosts is None:
        args.hosts = allocation.get("compute_nodes")

    hostnames = {
        "multi_match": {
            "query": " ".join(args.hosts),
            "type": "best_fields",
            "fields": ["hostname", "source"],
            "tie_breaker": 0.3,
            "minimum_should_match": 1
        }
    }

    # ---------------------------------------------------------------------------------------------
    date_format = '%Y-%m-%d %H:%M:%S.%f'
    search_format = 'epoch_millis'

    # Determine the timerange.
    # NOTE: strftime('%s') (epoch seconds) is a glibc extension — not
    # portable to all platforms.
    start_time = datetime.strptime(allocation.get("begin_time"), date_format)
    timestamp_range = {
        "gte": "{0}000".format(start_time.strftime('%s')),
        "format": search_format
    }

    # If a history is present end_time is end_time, otherwise it's now.
    if "history" in allocation:
        end_time = datetime.strptime(
            allocation.get("history").get("end_time"), date_format)
        timestamp_range["lte"] = "{0}999".format(end_time.strftime('%s'))

    timerange = {"range": {"@timestamp": timestamp_range}}

    # ---------------------------------------------------------------------------------------------
    # Matrix stats are very interesting..
    stats = {"statistics": {"matrix_stats": {"fields": args.fields}}}
    for field in args.fields:
        stats[field] = {"extended_stats": {"field": field}}

    body = {
        "query": {
            "bool": {
                "must": [hostnames, timerange]
            }
        },
        "aggs": stats,
        "size": 0
    }

    try:
        key_res = es.search(
            index=args.index,  # TODO This should be replaced.
            body=body)
    except exceptions.RequestError as e:
        cast.print_request_error(e)
        return 4

    if args.allocation_id > 0:
        print("\nMetric Analysis for Allocation ID {0} :\n".format(
            args.allocation_id))
    else:
        print("\nMetric Analysis for Job ID {0} - {1} :\n".format(
            args.job_id, args.job_id_secondary))

    # Print the table.
    aggs = cast.deep_get(key_res, "aggregations")
    if aggs is not None:
        max_width = len("Field")
        for agg in aggs:
            max_width = max(max_width, len(agg))

        print("{0:>{1}} | {2: >14} | {3: >14} | {4: >14} | {5: >14} | Count".
              format("Field", max_width, "Min", "Max", "Average", "Std Dev"))
        print_fmt = "{0: >{1}} | {2:>14.3f} | {3:>14.3f} | {4:>14.3f} | {5:>14.3f} | {6}"

        for agg in aggs:
            # The "statistics" (matrix_stats) entry has no min/max keys; the
            # KeyError/ValueError handlers skip it and any malformed entries.
            try:
                print(
                    print_fmt.format(agg, max_width, aggs[agg]["min"],
                                     aggs[agg]["max"], aggs[agg]["avg"],
                                     aggs[agg]["std_deviation"],
                                     aggs[agg]["count"]))
            except ValueError:
                continue
            except KeyError:
                continue

        # Print matrix stats (pairwise field correlations).
        if args.correlation:
            print("\n{0}".format("=" * 80))
            print("Field Correlations:")

            stat_fields = aggs["statistics"].get("fields", [])
            for stat in stat_fields:
                name = stat["name"]
                print("\n{0}:".format(name))

                correlation = stat["correlation"]
                corr_d = sorted(correlation.items(),
                                key=operator.itemgetter(1))
                for field in corr_d:
                    if field[0] != name:
                        print("  {0} : {1}".format(field[0], field[1]))
    else:
        print("No aggregations were found.")

    return 0
def main(args):
    """Run a weighted listing of keyword searches against a job's logs and
    present per-category aggregations of the results.

    Loads a JSON error map, looks up the job in the cast-allocation index,
    then executes one mapping query per error category over the job's time
    range and compute nodes.

    Returns:
        0 on success (or no matches), 2 on bad/missing arguments, 3 when the
        job lookup did not return exactly one hit, 4 on an Elasticsearch
        request error.
    """
    # Specify the arguments.
    parser = argparse.ArgumentParser(
        description='''A tool which takes a weighted listing of keyword searches and presents aggregations of this data to the user.'''
    )
    # type=int so the ID/size values are usable as numbers downstream.
    parser.add_argument('-a', '--allocationid', metavar='int', dest='allocation_id',
                        default=-1, type=int, help='The allocation ID of the job.')
    parser.add_argument('-j', '--jobid', metavar='int', dest='job_id',
                        default=-1, type=int, help='The job ID of the job.')
    parser.add_argument('-s', '--jobidsecondary', metavar='int', dest='job_id_secondary',
                        default=0, type=int,
                        help='The secondary job ID of the job (default : 0).')
    parser.add_argument(
        '-t', '--target', metavar='hostname:port', dest='target', default=None,
        help='An Elasticsearch server to be queried. This defaults to the contents of environment variable "CAST_ELASTIC".'
    )
    parser.add_argument(
        '-v', '--verbose', action='store_true',
        help='Displays the top --size logs matching the --errormap mappings.')
    parser.add_argument('--size', metavar='size', dest='size', default=10, type=int,
                        help='The number of results to be returned. (default=10)')
    parser.add_argument('-H', '--hostnames', metavar='host', dest='hosts',
                        nargs='*', default=None,
                        help='A list of hostnames to filter the results to.')
    parser.add_argument(
        '--errormap', metavar="file", dest="err_map_file", default=None,
        help='A map of errors to scan the user jobs for, including weights.')

    args = parser.parse_args()

    # If the target wasn't specified check the environment for the target value, printing help on failure.
    if args.target is None:
        if TARGET_ENV in os.environ:
            args.target = os.environ[TARGET_ENV]
        else:
            parser.print_help()
            print("Missing target, '%s' was not set." % TARGET_ENV)
            return 2

    # Load the weighted error mapping.
    error_map = None
    if args.err_map_file:
        error_map = JSONSerializer().loads(open(args.err_map_file).read())

    if error_map is None:
        parser.print_help()
        print("Error map '%s', could not be loaded" % args.err_map_file)
        return 2

    # Open a connection to the elastic cluster, if this fails is wrong on the server.
    es = Elasticsearch(args.target,
                       sniff_on_start=True,
                       sniff_on_connection_fail=True,
                       sniffer_timeout=60)

    # Execute the query on the cast-allocation index.
    try:
        tr_res = cast.search_job(es, args.allocation_id, args.job_id,
                                 args.job_id_secondary)
    except exceptions.RequestError as e:
        cast.print_request_error(e)
        return 4

    total_hits = cast.deep_get(tr_res, "hits", "total")

    # Finding no matches with valid search criteria is a legit case.
    # return 0, not 3
    if total_hits is None:
        print("# Sorry. Could not find any matching results.")
        return 0

    if total_hits != 1:
        print(
            "This implementation only supports queries where the hit count is equal to 1."
        )
        return 3

    # TODO make this code more fault tolerant
    hits = cast.deep_get(tr_res, "hits", "hits")
    tr_data = cast.deep_get(hits[0], "_source", "data")

    # ---------------------------------------------------------------------------------------------
    # Build the hostnames string; default to the job's compute nodes.
    if args.hosts is None:
        args.hosts = tr_data.get("compute_nodes")

    hostnames = {
        "multi_match": {
            "query": " ".join(args.hosts),
            "type": "best_fields",
            "fields": ["hostname", "source"],
            "tie_breaker": 0.3,
            "minimum_should_match": 1
        }
    }

    # ---------------------------------------------------------------------------------------------
    (ranges, should_match) = cast.build_timestamp_range(
        tr_data.get("begin_time"),
        cast.deep_get(tr_data, "history", "end_time"))
    ranges.append(hostnames)

    # ---------------------------------------------------------------------------------------------
    # Build a body for the mapping query.
    body = {
        "_source": ["@timestamp"],
        "size": args.size,
    }

    # Check the keywords supplied by the json.
    results = {}
    for error in error_map:
        (category, result) = build_mapping_query(es, body.copy(), ranges,
                                                 error)
        results[category] = result
    print(" ")

    # Print the results, categories ordered by descending max score.
    # dict.items() + a single-argument lambda keeps this Python-3 compatible
    # (iteritems() and tuple-parameter lambdas were removed in Python 3).
    for category, response in sorted(
            results.items(),
            key=lambda item: cast.deep_get(item[1], "hits", "max_score"),
            reverse=True):
        # Get aggregations; default to an empty dict so the sort below is
        # safe when the response has no "aggregations" key.
        aggregations = response.get("aggregations", {})
        total = cast.deep_get(response, "hits", "total")

        print("\"{0}\" Max Score : {1}".format(
            category, cast.deep_get(response, "hits", "max_score")))
        print("\"{0}\" Count : {1}".format(category, total))

        if aggregations is not None:
            # Sort aggregations by document count.
            for (aggregation, value) in sorted(
                    aggregations.items(),
                    key=lambda item: item[1].get("doc_count"),
                    reverse=True):
                print(" \"{0}\" : {1}".format(aggregation,
                                              value.get("doc_count")))

        if args.verbose:
            hits = cast.deep_get(response, "hits", "hits")
            print("\nTop {0} \"{1}\" Results:".format(len(hits), category))
            print("-" * 42)
            for hit in hits:
                print(json.dumps(hit["_source"]))
            print("=" * 42)

        print(" ")
def main(args):
    """Report when a job was running, using the big data store.

    Requires at least one of allocation ID or job ID; looks the job up in the
    cast-allocation index and prints its IDs, user and begin/end times (plus
    compute nodes when --verbose is given).

    Returns:
        0 on success (or no matches), 2 on bad/missing arguments, 3 when the
        job lookup returned more than one hit, 4 on an Elasticsearch request
        error.
    """
    # Specify the arguments.
    parser = argparse.ArgumentParser(
        description='''A tool for finding when a job was running through use of the big data store.'''
    )
    # type=int so the "== -1" sentinel checks below see real integers even
    # when the IDs are supplied on the command line.
    parser.add_argument('-a', '--allocationid', metavar='int', dest='allocation_id',
                        default=-1, type=int, help='The allocation ID of the job.')
    parser.add_argument('-j', '--jobid', metavar='int', dest='job_id',
                        default=-1, type=int, help='The job ID of the job.')
    parser.add_argument('-s', '--jobidsecondary', metavar='int', dest='job_id_secondary',
                        default=0, type=int,
                        help='The secondary job ID of the job (default : 0).')
    parser.add_argument(
        '-t', '--target', metavar='hostname:port', dest='target', default=None,
        help='An Elasticsearch server to be queried. This defaults to the contents of environment variable "CAST_ELASTIC".'
    )
    parser.add_argument('-H', '--hostnames', metavar='host', dest='hosts',
                        nargs='*', default=None,
                        help='A list of hostnames to filter the results to ')
    parser.add_argument(
        '-v', '--verbose', action='store_true',
        help='Displays additional details about the job in the output.')

    args = parser.parse_args()

    # If allocation_id or job_id wasn't specified, printing help on failure.
    if args.allocation_id == -1 and args.job_id == -1:
        parser.print_help()
        print(
            "Missing either allocationid or jobid. Require 1 of these fields to search."
        )
        return 2

    # If the target wasn't specified check the environment for the target value, printing help on failure.
    if args.target is None:
        if TARGET_ENV in os.environ:
            args.target = os.environ[TARGET_ENV]
        else:
            parser.print_help()
            print("Missing target, '%s' was not set." % TARGET_ENV)
            return 2

    # Set up the fields for the search operation.
    # Copy the shared module-level list: appending to cast.SEARCH_JOB_FIELDS
    # directly would mutate the constant for every later caller.
    fields = list(cast.SEARCH_JOB_FIELDS)
    if args.verbose:
        fields.append("data.compute_nodes")

    # Open a connection to the elastic cluster, if this fails is wrong on the server.
    es = Elasticsearch(args.target,
                       sniff_on_start=True,
                       sniff_on_connection_fail=True,
                       sniffer_timeout=60)

    # Execute the query on the cast-allocation index.
    try:
        tr_res = cast.search_job(es, args.allocation_id, args.job_id,
                                 args.job_id_secondary, fields=fields)
    except exceptions.RequestError as e:
        cast.print_request_error(e)
        return 4

    total_hits = cast.deep_get(tr_res, "hits", "total")
    print("# Found {0} matches for specified the job.".format(total_hits))

    if total_hits == 0:
        print("# Sorry. Could not find any matching results.")
        return 0

    if total_hits != 1:
        print(
            "# This implementation only supports queries where the hit count is equal to 1."
        )
        return 3

    # TODO make this code more fault tolerant
    hits = cast.deep_get(tr_res, "hits", "hits")
    if len(hits) > 0:
        tr_data = cast.deep_get(hits[0], "_source", "data")

        date_format = '%Y-%m-%d %H:%M:%S.%f'
        print_format = '%Y-%m-%d.%H:%M:%S:%f'

        # Truncate microseconds to milliseconds ([:-3]) for display.
        start_time = datetime.strptime(tr_data["begin_time"],
                                       '%Y-%m-%d %H:%M:%S.%f')
        start_time = '{0}'.format(start_time.strftime(print_format)[:-3])

        # If a history is present end_time is end_time, otherwise it's now.
        if "history" in tr_data:
            end_time = datetime.strptime(tr_data["history"]["end_time"],
                                         date_format)
            end_time = '{0}'.format(end_time.strftime(print_format)[:-3])
        else:
            end_time = "now"

        print("\nallocation-id: {0}".format(tr_data["allocation_id"]))
        print("job-id: {0} - {1}".format(tr_data["primary_job_id"],
                                         tr_data["secondary_job_id"]))
        print("user-name: {0} \nuser-id: {1}".format(tr_data["user_name"],
                                                     tr_data["user_id"]))
        print("begin-time: {0} \nend-time: {1}".format(start_time, end_time))

        if args.verbose:
            nodes = tr_data.get("compute_nodes", [])
            print('hostnames: ')
            for node in nodes:
                print(" - {0}".format(node))