# Module-level dependencies for the functions below. The Elasticsearch client
# `es` is assumed to be configured elsewhere in the package; the default local
# client created here is only a stand-in.
import collections
import itertools

from elasticsearch import Elasticsearch, TransportError, helpers
from flask import jsonify

es = Elasticsearch()  # assumption: replace with the package's configured client
def segment(app, app_type=None, params=''):
    """
    Segment the raw logs. Currently supports a single "field:value" match;
    falls back to match_all when no query is given.
    """
    q = params.get("q") if params.get("q") else {}
    fields = params.get("fields") if params.get("fields") else []
    size = params.get("size") if params.get("size") else 10
    scroll = params.get("scroll") if params.get("scroll") else False
    fl = params.get("fl") if params.get("fl") else []
    # filters = params.get("filter") if params.get("filter") else {}

    # 'q': args.get('q', '{}'),
    # 'fields': args.get('fl', '{}'),
    # 'size': args.get('size', 100),
    # 'scroll': args.get('scroll', False),
    # 'filters': request_args.getlist('fq')
    query = {}
    query['size'] = size

    if q:
        # Expect "field:value" and translate it into a match query
        key, val = q.split(":", 1)
        query['query'] = {"match": {key: val}}
    else:
        query['query'] = {"match_all": {}}

    if len(fields) > 0:
        ex = {"include": fields.split(",")}
        query['_source'] = ex

    response = es.search(index=app, doc_type=app_type, body=query)
    return jsonify(response)
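# Illustrative sketch only (not part of the original module): the shape of the
# `params` object that segment() expects. Any dict-like object with a .get()
# method works, e.g. Flask's request.args; the values below are assumptions
# for demonstration.
def _example_segment_call():
    params = {
        "q": "type:click",        # single "field:value" match
        "fields": "target,type",  # comma-separated _source includes
        "size": 5,
    }
    return segment("xdata_v3", app_type="logs", params=params)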
def _get_all_fields(app, app_type=None):
    """
    Retrieve all possible fields in an application

    :param app: [string] application name (e.g. xdata_v3)
    :param app_type: [string] application type (e.g. logs)
    :return: [list] list of strings representing the field names
    """
    d = list()
    query = {
        "aggs": {
            "fields": {
                "terms": {
                    "field": "_field_names",
                    "size": 100
                }
            }
        }
    }

    try:
        response = es.search(index=app, doc_type=app_type, body=query)
        for tag in response['aggregations']['fields']['buckets']:
            d.append(tag['key'])
    except TransportError as e:
        d.append(str(e.info))
    except Exception as e:
        d.append(str(e))
    return d
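# Illustrative sketch only: _get_all_fields() returns a flat list of field
# names, so it can be serialized directly for a Flask view. The view function
# here is an assumption, not part of the original module.
def _example_fields_view(app):
    return jsonify(fields=_get_all_fields(app, app_type="logs"))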
def search(app, app_type=None, filters=list(), size=100,
           include="*", scroll=None, sort_field=None):
    """
    Perform a search query.

    :param app: [string] application id (e.g. "xdata_v3")
    :param app_type: [string] name of the application type. If None all
                     application types are searched.
    :param filters: [list of strings] list of filters for a query.
    :param size: [int] maximum number of hits that should be returned
    :param sort_field: [string] sorting field. Currently supported fields:
                       "timestamp", "date"
    :return: [dict] dictionary with processed results. If STOUT is enabled,
             STOUT data will be merged with final result.
    """
    # Need some query builder...
    query = {}

    log_result = es.search(index=app, doc_type=app_type, body=query,
                           fields=filters, size=size)

    # stout_result = Stout.getSessions()
    # data = merged_results(log_result, stout_result)
    return log_result
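# Minimal sketch of the query builder that the comment in search() asks for.
# This is an assumption about the intended behaviour, not the project's actual
# builder: it turns a list of "field:value" filter strings into a bool/term
# Elasticsearch body that could be passed to es.search().
def _build_filter_query(filters=list(), size=100):
    terms = []
    for f in filters:
        if ":" in f:
            key, val = f.split(":", 1)
            terms.append({"term": {key: val}})
    body = {"size": size}
    if terms:
        body["query"] = {"bool": {"filter": terms}}
    else:
        body["query"] = {"match_all": {}}
    return body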
def histogram(app, app_type=None, q=""): """ Only works on numerical data. """ field = q.get("field") if q.get("field") else "" interval = 50 query = { "aggs": { "hist_agg": { "histogram": { "field": field, "interval": interval } } } } d = {} try: response = es.search(index=app, doc_type=app_type, body=query) for tag in response['aggregations']['hist_agg']['buckets']: d[tag['key']] = tag['doc_count'] except TransportError as e: d['error'] = e.info except Exception as e: d['error'] = str(e) return jsonify(d)
def histogram (app, app_type=None, q=""): """ Only works on numerical data. """ field = q.get ("field") if q.get ("field") else "" interval = 50 query = { "aggs" : { "hist_agg" : { "histogram" : { "field" : field, "interval" : interval } } } } d = {} try: response = es.search (index=app, doc_type=app_type, body=query) for tag in response['aggregations']['hist_agg']['buckets']: d [tag ['key']] = tag ['doc_count'] except TransportError as e: d ['error'] = e.info except Exception as e: d ['error'] = str (e) return jsonify (d)
def unique_terms(app, app_type=None, q=""): """ Aggregate the number of unique terms in a field. Missing values are counted and marked as "N/A". .. todo:: Need to incorporate QueryBuilder library instead of manually generating queries. :param app: [string] application name :param app_type: [string] application type :param field: [string] field to search against for unique values :param size: [int] the top size terms returned in the result. Default value is 10. :param min_hits: [int] return tags which have been found in min_hits or more. Default value is 1. :return: [dict] dictionary of results """ field = q.get("field") if q.get("field") else "" size = q.get("size") if q.get("size") else 10000 min_hits = q.get("min_hits") if q.get("min_hits") else 0 print field query = { "aggs": { "terms_agg": { "terms": { "field": field, "size": size, "min_doc_count": min_hits, "missing": "N/A" } } } } d = {} try: response = es.search(index=app, doc_type=app_type, body=query) for tag in response['aggregations']['terms_agg']['buckets']: d[tag['key']] = tag['doc_count'] except TransportError as e: d['error'] = e.info except Exception as e: d['error'] = str(e) return jsonify(d)
def unique_terms (app, app_type=None, q=""): """ Aggregate the number of unique terms in a field. Missing values are counted and marked as "N/A". .. todo:: Need to incorporate QueryBuilder library instead of manually generating queries. :param app: [string] application name :param app_type: [string] application type :param field: [string] field to search against for unique values :param size: [int] the top size terms returned in the result. Default value is 10. :param min_hits: [int] return tags which have been found in min_hits or more. Default value is 1. :return: [dict] dictionary of results """ field = q.get ("field") if q.get ("field") else "" size = q.get ("size") if q.get ("size") else 10000 min_hits = q.get ("min_hits") if q.get ("min_hits") else 0 print field query = { "aggs" : { "terms_agg" : { "terms" : { "field" : field, "size" : size, "min_doc_count" : min_hits, "missing" : "N/A" } } } } d = {} try: response = es.search (index=app, doc_type=app_type, body=query) for tag in response['aggregations']['terms_agg']['buckets']: d [tag ['key']] = tag ['doc_count'] except TransportError as e: d ['error'] = e.info except Exception as e: d ['error'] = str (e) return jsonify (d)
def terms(app, app_type=None, q=''):
    """
    Group by a field and return the top hits for each bucket.
    """
    field = q.get("field") if q.get("field") else ""
    segment = q.get("seg") if q.get("seg") else "*"
    size = q.get("size") if q.get("size") else 10000
    numhits = q.get("numhits") if q.get("numhits") else 10

    query = {
        "aggs": {
            "count_by_type": {
                "terms": {
                    "field": field,
                    "size": size  # maximum number of keys (unique fields)
                },
                "aggs": {
                    "top": {  # arbitrary name
                        "top_hits": {
                            "size": numhits,  # number of logs in subgroup
                            "_source": {
                                # segment on fields - return only subgroup
                                # based on field
                                "include": [segment]
                            }
                        }
                    }
                }
            }
        }
    }

    d = {}
    # try:
    response = es.search(index=app, doc_type=app_type, body=query)
    #     for tag in response['aggregations']['count_by_type']['buckets']:
    #         d[tag['key']] = tag['doc_count']
    # except TransportError as e:
    #     d['error'] = e.info
    # except Exception as e:
    #     d['error'] = str(e)
    # return jsonify(d)
    return jsonify(response)
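# Illustrative sketch only: terms() groups logs by `field` and returns up to
# `numhits` example documents per bucket, with _source trimmed to `seg`.
# The parameter values below are assumptions for demonstration.
def _example_terms_call(app):
    return terms(app, app_type="logs",
                 q={"field": "target", "seg": "type", "numhits": 5})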
def get_applications():
    """
    Fetch all the registered applications in Distill.

    .. note::

        Private indexes starting with a period are not included in the
        result set.

    :return: [dict] dictionary of all registered applications and meta info
    """
    doc = {}
    query = {
        "aggs": {
            "count_by_type": {
                "terms": {
                    "field": "_type",
                    "size": 100
                }
            }
        }
    }

    try:
        cluster_status = es.cat.indices(h=["index"], pri=False)
        x = cluster_status.splitlines()

        for idx in x:
            idx = idx.rstrip()

            # Ignore private indexes (like .kibana or .stout)
            if idx[:1] != '.':
                response = es.search(index=idx, body=query)
                d = {}
                for tag in response["aggregations"]["count_by_type"]["buckets"]:
                    d[tag['key']] = tag['doc_count']
                doc[idx] = d
    except TransportError as e:
        doc['error'] = e.info
    except Exception as e:
        doc['error'] = str(e)
    return doc
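# Illustrative sketch only: get_applications() returns a plain dict keyed by
# index name, so a caller still needs to serialize it. The wrapper below is an
# assumption, not part of the original module.
def _example_applications_view():
    return jsonify(applications=get_applications())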
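# generate_graph() below relies on a pairwise() helper that is not defined in
# this listing. A stand-in is sketched here, assuming it follows the standard
# itertools recipe: it yields consecutive (s0, s1), (s1, s2), ... pairs from
# an iterable, returned as a list so it can be re-iterated.
def pairwise(iterable):
    a, b = itertools.tee(iterable)
    next(b, None)
    return list(zip(a, b))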
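# generate_graph() also calls generate_bargraph(), which is not part of this
# listing either. Its exact output format is unknown; the sketch below simply
# assumes it flattens the "targets" terms-aggregation buckets into a
# {target: doc_count} mapping for a bar chart, and should be treated as a
# placeholder.
def generate_bargraph(buckets):
    return {bucket['key']: bucket['doc_count'] for bucket in buckets}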
def generate_graph(app, app_type='logs', log_type='raw',
                   targets=[], events=[],
                   time_range=['now-1h', 'now'], size=20):
    """
    Return all elements from an application, possibly matching against a
    specific event type (e.g. click, mouseover, etc.)
    """
    # @TODO ref_url filter
    must_not_query = [
        {"term": {"type": "mousedown"}},
        {"term": {"type": "mouseup"}}
    ]

    filter_query = [
        {"term": {"logType": log_type}},
    ]

    # Filtering
    should_query = []
    must_query = []

    # Include these events in the request
    if events:
        include_events = {"terms": {"type": events}}
        filter_query.append(include_events)

    target_in = targets[0]
    target_out = targets[1]

    if target_in:
        include_targets = {"terms": {"target": target_in}}
        filter_query.append(include_targets)

    # Remove these element ids from the result set
    for target in target_out:
        res = {"term": {"target": target}}
        must_not_query.append(res)

    # Finish off should query
    # must_query.append({"bool": {"should": should_query}})

    # Sort by time
    sort_query = [{"clientTime": {"order": "asc"}}]

    # Timestamp range - date math
    timestamp_query = {
        "range": {
            "@timestamp": {
                "gte": time_range[0],
                "lte": time_range[1]
            }
        }
    }
    filter_query.append(timestamp_query)

    agg_query = dict()

    # Get all unique sessions
    session_query = {"terms": {"field": "sessionID", "min_doc_count": 1}}
    agg_query['sessions'] = session_query

    # Generate all top targets and breakdowns by type, including path_length
    target_query = {
        "terms": {
            "field": "target",
            "min_doc_count": 1,
            "size": size
        },
        "aggs": {
            "events": {
                "terms": {
                    "field": "type",
                    "min_doc_count": 1,
                    "size": size
                }
            },
            "top_target": {
                "top_hits": {
                    "script_fields": {
                        "path_length": {
                            "script": {
                                "lang": "painless",
                                "inline": "doc['path'].length;"
                            }
                        }
                    },
                    "size": 1
                }
            }
        }
    }
    agg_query['targets'] = target_query

    # Main query
    query = {
        "sort": sort_query,
        "query": {
            "bool": {
                # "must": must_query,
                # "should": should_query,
                "filter": filter_query,
                "must_not": must_not_query,
                # "minimum_should_match": len(should_query) - 1
            }
        },
        "_source": {
            "includes": ['*'],
        },
        "script_fields": {
            "path_length": {
                "script": {
                    "lang": "painless",
                    "inline": "doc['path'].length;"
                }
            }
        },
        "aggregations": agg_query
    }

    # return query
    # Process aggregate results
    response = es.search(app, doc_type=app_type, body=query, size=0)
    # Only want to look at aggregations
    sessions = response['aggregations']['sessions']['buckets']
    # allSessions = {x['key']: [] for x in sessions}
    # intervalSessions = {x['key']: [] for x in sessions}

    # Deal with bar chart
    allTargets = response['aggregations']['targets']['buckets']

    # Re-execute query to get all hits
    iter = helpers.scan(es, query=query, index=app, doc_type=app_type,
                        preserve_order=True)

    allSessions = dict()
    # Store all hits in the user's bucket.
    for elem in iter:
        data = elem['_source']
        data['pathLength'] = elem['fields']['path_length'][0]
        if 'sessionID' in data:
            sessionID = data['sessionID']
            if sessionID in allSessions:
                allSessions[sessionID].append(data)
            else:
                allSessions[sessionID] = [data]

    # This fixes the sequence/interval logging that was produced by
    # UserALE.js v0.2.0. Possible to remove self-loops here as well
    # (html->html->html->window) := (html->window)
    intervalSessions = dict()
    for sessionID in allSessions:
        data = allSessions[sessionID]
        newData = []
        intervalLog = []
        pairs = zip(data, data[1:])
        for curr, next in pairs:
            target1 = curr['target']
            event1 = curr['type']
            target2 = next['target']
            event2 = next['type']
            if target1 != target2:  # ignore self-loops
                targetChange = int(True)
                eventChange = int(False)
                if event1 != event2:
                    eventChange = int(True)

                # Starting over no matter what; based off of curr, update the log
                curr['targetChange'] = targetChange
                curr['typeChange'] = eventChange
                curr['intervalCount'] = len(intervalLog)  # some number, maybe 0
                if len(intervalLog) >= 2:
                    # Calculate duration (was intervalLog[-1:], a list slice,
                    # which would have raised a TypeError)
                    curr['duration'] = intervalLog[-1]['clientTime'] - \
                        intervalLog[0]['clientTime']
                else:
                    curr['duration'] = 0
                newData.append(curr)
                intervalLog = []
            # else:
            #     # They are the same
            #     targetChange = int(False)
            #     eventChange = int(False)
            #     if event1 != event2:
            #         eventChange = int(True)
            #         # starting over
            #         curr['targetChange'] = targetChange
            #         curr['typeChange'] = eventChange
            #         curr['intervalCount'] = len(intervalLog)
            #         # if len(intervalLog) >= 2:
            #         #     # Calculate duration
            #         #     curr['duration'] = intervalLog[-1]['clientTime'] - \
            #         #         intervalLog[0]['clientTime']
            #         # else:
            #         #     curr['duration'] = 0
            #         newData.append(curr)
            #         intervalLog = []
            #     else:
            #         # increase counter
            #         intervalLog.append(curr)
        intervalSessions[sessionID] = newData

    # return intervalSessions

    newSessions = []
    # Generate all edges tied to a user
    # [edge list, edge list, ...]
    for k, v in intervalSessions.items():
        pairs = pairwise(v)  # list of edges for a user
        newSessions.append(pairs)

    # Node map
    node_list = []  # Need to keep 0-based index for the sankey diagram
    links = []      # Aggregate sequence list
    node_map = []   # Final node map {"name": "foo", "id": 0}

    # Align the sequences
    alignment = itertools.izip_longest(*newSessions)  # zip_longest on Python 3
    src_ids = {}
    target_ids = {}

    for i, step in enumerate(alignment):
        # print(i)
        c = collections.Counter()
        visitedLinks = []
        # visitedLinksUnique = set([])
        nodenames = set([])

        for edge in step:
            # For a single step look at all links
            if edge:
                node1 = edge[0]
                node2 = edge[1]
                session = node1['sessionID']
                nodename1 = node1['target']
                nodename2 = node2['target']
                seqID = '%s->%s' % (nodename1, nodename2)
                # print(seqID)

                if nodename1 != nodename2:  # double check again for self-loops
                    # print(node1)
                    link = {
                        'sequenceID': seqID,
                        'sourceName': nodename1,
                        'targetName': nodename2,
                        'type': node1['type'],
                        'duration': node1['duration'],
                        'pathLength': len(node1['path'])
                        if node1['path'] is not None else 0,
                        'targetChange': node1['targetChange'],
                        'typeChange': node1['typeChange']
                    }
                    visitedLinks.append(link)

        # Done with visits in a step; now calculate counts
        counts = collections.Counter(k['sequenceID'] for k in visitedLinks
                                     if k.get('sequenceID'))
        # print(counts)
        visitedLinksUnique = {v['sequenceID']: v for v in visitedLinks}.values()
        # print(visitedLinksUnique)

        # Visit unique links and generate src/target ids
        if len(node_map) == 0:
            for link in visitedLinksUnique:
                # Add all sources
                if link['sourceName'] not in src_ids:
                    node_map.append({"name": link['sourceName']})
                    src_ids[link['sourceName']] = len(node_map) - 1
                # Add all targets
                if link['targetName'] not in target_ids:
                    node_map.append({"name": link['targetName']})
                    target_ids[link['targetName']] = len(node_map) - 1
        else:
            src_ids = target_ids  # sources were previous targets
            target_ids = {}
            for link in visitedLinksUnique:
                # Add all sources
                # if link['sourceName'] not in src_ids.values():
                #     node_map.append(link['sourceName'])
                #     src_ids[len(node_map) - 1] = link['sourceName']
                # Add all targets
                if link['targetName'] not in target_ids:
                    node_map.append({"name": link['targetName']})
                    target_ids[link['targetName']] = len(node_map) - 1

        for link in visitedLinksUnique:
            # Perform lookup for ids and counts
            link['source'] = src_ids[link['sourceName']]
            link['target'] = target_ids[link['targetName']]
            link['value'] = counts[link['sequenceID']]
            links.append(link)

    # Save everything
    res = dict()
    res['histogram'] = generate_bargraph(allTargets)
    # res['sankey'] = {
    #     # 'sessions': sessions,
    #     'links': links,
    #     'nodes': node_map
    # }
    res['nodes'] = node_map
    res['links'] = links
    res['sessions'] = sessions

    # with open('sankey.json', 'w') as outfile:
    #     json.dump(res, outfile, sort_keys=False, indent=4)
    # with open('data.txt', 'w') as outfile:
    #     json.dump(intervalSessions, outfile, indent=4, sort_keys=False)
    # with open('query.json', 'w') as outfile:
    #     json.dump(query, outfile, indent=4, sort_keys=False)

    return res
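# Illustrative sketch only: generate_graph() expects `targets` as a two-element
# list [targets_to_include, targets_to_exclude], and `events` as a list of
# event types to keep. The concrete values below are assumptions for
# demonstration.
def _example_generate_graph_call(app):
    return generate_graph(app,
                          app_type="logs",
                          log_type="raw",
                          targets=[["html", "body"], []],
                          events=["click", "mouseover"],
                          time_range=["now-7d", "now"],
                          size=20)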