def queryAvgValuebyHost(idx, fromDate, toDate):
    """Daily averages of the index's value field, bucketed per host.

    Runs one terms/date_histogram aggregation per direction field
    ('src_host' and 'dest_host') over [fromDate, toDate] and returns
    {direction_field: [{'host', 'period', <value_field>}, ...]}.
    """
    val_fld = hp.getValueField(idx)

    def _aggregate(group_field):
        # One bucket per host (up to 9999), each split into daily sub-buckets
        # carrying the average of the value field.
        body = {
            "size": 0,
            "query": {
                "bool": {
                    "must": [
                        {"range": {"timestamp": {"gte": fromDate, "lte": toDate}}}
                    ]
                }
            },
            "aggs": {
                "host": {
                    "terms": {"field": group_field, "size": 9999},
                    "aggs": {
                        "period": {
                            "date_histogram": {
                                "field": "timestamp",
                                "calendar_interval": "day"
                            },
                            "aggs": {
                                val_fld: {"avg": {"field": val_fld}}
                            }
                        }
                    }
                }
            }
        }
        return hp.es.search(index=idx, body=body)

    result = {}
    for direction in ('src_host', 'dest_host'):
        response = _aggregate(direction)
        result[direction] = [
            {'host': host_bucket['key'],
             'period': day_bucket['key'],
             val_fld: day_bucket[val_fld]['value']}
            for host_bucket in response['aggregations']['host']['buckets']
            for day_bucket in host_bucket['period']['buckets']
        ]
    return result
def getData(self, src, dest):
    """Fetch raw values for a src/dest pair and attach derived columns.

    Queries all values for the configured index over the parent's date
    range, renames the index-specific value field to 'value', and — when
    any rows came back — adds 'log_value' (natural log, with zeros mapped
    to NaN first, since log(0) is undefined) and 'sqrt' columns.
    """
    periods = hp.GetTimeRanges(self.root_parent.dateFrom, self.root_parent.dateTo)
    df = pd.DataFrame(qrs.queryAllValues(self._idx, src, dest, periods))
    df.rename(columns={hp.getValueField(self._idx): 'value'}, inplace=True)
    if not df.empty:
        df['log_value'] = np.log(df['value'].replace(0, np.nan))
        df['sqrt'] = df['value'] ** 0.5
    return df
def getValues(self, probdf):
    """Collect raw time-series values for every (src, dest, idx) row of probdf.

    For each row, queries all values, tags them with the originating index,
    a "src-dest" hash, and the src/dest themselves, and renames the
    index-specific value field to 'value'. Returns one concatenated
    DataFrame (columns at least: timestamp, value, idx, hash, src, dest).
    """
    # probdf = markNodes()
    time_list = hp.GetTimeRanges(self.dateFrom, self.dateTo)
    frames = []
    for item in probdf[['src', 'dest', 'idx']].values:
        # NOTE(review): argument order here disagrees with
        # queryAllValues(idx, src, dest, period) elsewhere in this module —
        # confirm which qrs.queryAllValues signature is intended.
        tempdf = pd.DataFrame(
            qrs.queryAllValues(item[2], item, time_list[0], time_list[1]))
        tempdf['idx'] = item[2]
        tempdf['hash'] = item[0] + "-" + item[1]
        tempdf['src'] = item[0]
        tempdf['dest'] = item[1]
        tempdf.rename(columns={hp.getValueField(item[2]): 'value'}, inplace=True)
        frames.append(tempdf)
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame on
    # every iteration; collect per-row frames and concatenate once instead.
    # Seeding with an empty frame keeps the original leading column order and
    # the empty-probdf result shape.
    base = pd.DataFrame(columns=['timestamp', 'value', 'idx', 'hash'])
    return pd.concat([base, *frames], ignore_index=True)
def queryAllValuesFromList(idx, fld_type, val_list, period):
    """Scan all documents whose fld_type is in val_list within the period.

    Returns a list of _source dicts containing timestamp, src, dest and the
    index's value field.
    """
    val_fld = hp.getValueField(idx)
    query = {
        "size": 0,
        "_source": {
            "includes": ["timestamp", "dest", "src", val_fld]
        },
        "query": {
            "bool": {
                "must": [
                    {"range": {"timestamp": {"gte": period[0], "lte": period[1]}}},
                    {"terms": {fld_type: val_list}}
                ]
            }
        }
    }
    # BUG FIX: the scan client was the bare name `es`; every other query in
    # this module (and the previously commented-out call here) uses hp.es.
    data = scan(client=hp.es, index=idx, query=query,
                _source=["timestamp", "dest", "src", val_fld],
                filter_path=['_scroll_id', '_shards', 'hits.hits._source'])
    allData = []
    for count, res in enumerate(data):
        if count % 100000 == 0:
            print(count)  # progress marker for very large scans
        allData.append(res['_source'])
    return allData
def PairAverageValuesQuery(idx, time_from, time_to, args):
    """30-minute averages of the index's value field for one src/dest host pair.

    args is (src, dest) — host names. Because an IP may have been replaced
    by its host name at a previous step, every key in hp.hosts mapping to
    either name is tried, and the query runs once per (src, dest) key
    combination. Returns a flat list of
    {'ipv6', 'ts', <value_field>, 'doc_count'} dicts.
    """
    src = args[0]
    dest = args[1]
    field = hp.getValueField(idx)

    def runQuery(src, dest):
        # Exact-match the pair, then composite-group by
        # (src_host, dest_host, ipv6) and average the value field in
        # 30-minute date_histogram slots.
        query = {
            "size": 0,
            "query": {
                "bool": {
                    "must": [
                        {"term": {"src_host": {"value": src}}},
                        {"term": {"dest_host": {"value": dest}}},
                        {"range": {"timestamp": {"from": time_from, "to": time_to}}}
                    ]
                }
            },
            "_source": False,
            "aggregations": {
                "groupby": {
                    "composite": {
                        "size": 10000,
                        "sources": [
                            {"src_host": {"terms": {"field": "src_host", "missing_bucket": True, "order": "asc"}}},
                            {"dest_host": {"terms": {"field": "dest_host", "missing_bucket": True, "order": "asc"}}},
                            {"ipv6": {"terms": {"field": "ipv6", "missing_bucket": True, "order": "asc"}}}
                        ]
                    },
                    "aggregations": {
                        "ts": {
                            "date_histogram": {"field": "timestamp", "fixed_interval": "30m"},
                            "aggs": {
                                field: {"avg": {"field": field}}
                            }
                        }
                    }
                }
            }
        }
        results = hp.es.search(index=idx, body=query)
        # Flatten: one record per (composite bucket, 30m slot).
        res = []
        for item in results["aggregations"]["groupby"]["buckets"]:
            for p in item['ts']['buckets']:
                res.append({'ipv6': item['key']['ipv6'], 'ts': p['key'], field: p[field]['value'], 'doc_count': p['doc_count']})
        return res

    # In case an IP was replaced by the host name at a previous step we need
    # to find all possible values for the given hosts and run the query with
    # each combination of the two.
    data = []
    src_items, dest_items = [], []
    for k, v in hp.hosts.items():
        if src == v:
            src_items.append(k)
        if dest == v:
            dest_items.append(k)
    combinations = list(itertools.product(src_items, dest_items))
    print(combinations)
    for c in combinations:
        output = runQuery(c[0], c[1])
        if len(output) > 0:
            data.extend(output)
    return data
def GetPairsForAHostV1(idx, time_from, time_to, args):
    """V1: average value per (src_host, dest_host) pair involving one host.

    args is (host,). Matches documents where the host appears as either
    src_host or dest_host in the time range, composite-groups by the pair,
    and averages the index's value field. Returns a list of
    {'src_host', 'dest_host', <value_field>} dicts.

    NOTE(review): appears superseded by GetPairsForAHost (which also expands
    the host via hp.hosts); still prints its inputs and the raw query for
    debugging.
    """
    host = args[0]
    field = hp.getValueField(idx)
    print(host, field, idx, time_from, time_to)

    # def runQuery(host):
    query = {
        "size": 0,
        "query": {
            "bool": {
                "must": [
                    {
                        "bool": {
                            # host may be on either end of the measurement
                            "should": [
                                {"term": {"src_host": {"value": host}}},
                                {"term": {"dest_host": {"value": host}}}
                            ]
                        }
                    },
                    {"range": {"timestamp": {"from": time_from, "to": time_to}}}
                ]
            }
        },
        "_source": False,
        "aggregations": {
            "groupby": {
                "composite": {
                    "size": 9999,
                    "sources": [
                        {"src_host": {"terms": {"field": "src_host", "missing_bucket": True, "order": "asc"}}},
                        {"dest_host": {"terms": {"field": "dest_host", "missing_bucket": True, "order": "asc"}}}
                    ]
                },
                "aggregations": {
                    field: {"avg": {"field": field}}
                }
            }
        }
    }
    # Debug: dump the query in JSON-ish form (single → double quotes).
    print(str(query).replace("\'", "\""))
    results = hp.es.search(index=idx, body=query)
    res = []
    for item in results["aggregations"]["groupby"]["buckets"]:
        res.append({'src_host': item['key']['src_host'], 'dest_host': item['key']['dest_host'], field: item[field]['value']})
    return res
def GetPairsForAHost(idx, time_from, time_to, args):
    """Average value per (src_host, dest_host) pair involving one host.

    args is (host,). For every key in hp.hosts whose value equals the host
    (an IP may have been replaced by its host name earlier), queries pairs
    where that key appears as src_host or dest_host, composite-groups by
    the pair, and averages the index's value field. Returns a list of
    {'src_host', 'dest_host', <value_field>} dicts.
    """
    host = args[0]
    field = hp.getValueField(idx)
    print(host, field, idx, time_from, time_to)

    def runQuery(host):
        query = {
            "size": 0,
            "query": {
                "bool": {
                    "must": [
                        {
                            "bool": {
                                # host may be on either end of the measurement
                                "should": [
                                    {"term": {"src_host": {"value": host}}},
                                    {"term": {"dest_host": {"value": host}}}
                                ]
                            }
                        },
                        {"range": {"timestamp": {"from": time_from, "to": time_to}}}
                    ]
                }
            },
            "_source": False,
            "aggregations": {
                "groupby": {
                    "composite": {
                        "size": 9999,
                        "sources": [
                            {"src_host": {"terms": {"field": "src_host", "missing_bucket": True, "order": "asc"}}},
                            {"dest_host": {"terms": {"field": "dest_host", "missing_bucket": True, "order": "asc"}}}
                        ]
                    },
                    "aggregations": {
                        field: {"avg": {"field": field}}
                    }
                }
            }
        }
        results = hp.es.search(index=idx, body=query)
        res = []
        for item in results["aggregations"]["groupby"]["buckets"]:
            res.append({'src_host': item['key']['src_host'], 'dest_host': item['key']['dest_host'], field: item[field]['value']})
        return res

    # Run once per hp.hosts key that maps to the requested host name.
    data, host_items = [], []
    for k, v in hp.hosts.items():
        if host == v:
            output = runQuery(k)
            if len(output) > 0:
                data.extend(output)
    return data
def queryDailyAvg(idx, fld, dateFrom, dateTo):
    """Daily average of the index's value field, grouped by fld.

    Only documents where both src_production and dest_production are true
    are counted. Returns {bucket_key: {day_bucket_key: average}}.
    """
    val_fld = hp.getValueField(idx)
    body = {
        "size": 0,
        "query": {
            "bool": {
                "must": [
                    {"range": {"timestamp": {"gte": dateFrom, "lte": dateTo}}},
                    {"term": {"src_production": True}},
                    {"term": {"dest_production": True}}
                ]
            }
        },
        "aggs": {
            "avg_values": {
                "terms": {"field": fld, "size": 9999},
                "aggs": {
                    "period": {
                        "date_histogram": {
                            "field": "timestamp",
                            "calendar_interval": "day"
                        },
                        "aggs": {
                            val_fld: {"avg": {"field": val_fld}}
                        }
                    }
                }
            }
        }
    }
    response = hp.es.search(index=idx, body=body)
    # Flatten the two aggregation levels into a nested dict.
    return {
        group['key']: {
            day['key']: day[val_fld]['value']
            for day in group['period']['buckets']
        }
        for group in response['aggregations']['avg_values']['buckets']
    }
def query4Avg(idx, dateFrom, dateTo):
    """Average value per (src, dest) pair over (dateFrom, dateTo].

    Restricted to production hosts on both ends. Returns a list of dicts
    with hash ("src-dest"), src, dest, value, from, to and doc_count.
    """
    val_fld = hp.getValueField(idx)
    body = {
        "size": 0,
        "query": {
            "bool": {
                "must": [
                    # note: lower bound is exclusive ("gt"), upper inclusive
                    {"range": {"timestamp": {"gt": dateFrom, "lte": dateTo}}},
                    {"term": {"src_production": True}},
                    {"term": {"dest_production": True}}
                ]
            }
        },
        "aggregations": {
            "groupby": {
                "composite": {
                    "size": 9999,
                    "sources": [
                        {"src": {"terms": {"field": "src"}}},
                        {"dest": {"terms": {"field": "dest"}}}
                    ]
                },
                "aggs": {
                    val_fld: {"avg": {"field": val_fld}}
                }
            }
        }
    }
    response = hp.es.search(index=idx, body=body)
    return [
        {
            'hash': bucket['key']['src'] + '-' + bucket['key']['dest'],
            'src': bucket['key']['src'],
            'dest': bucket['key']['dest'],
            'value': bucket[val_fld]['value'],
            'from': dateFrom,
            'to': dateTo,
            'doc_count': bucket['doc_count']
        }
        for bucket in response['aggregations']['groupby']['buckets']
    ]
def queryAllValues(idx, src, dest, period):
    """Scan every document for an exact src/dest pair within a period.

    period is a (from, to) pair. Only production hosts on both ends are
    included; results are sorted by timestamp ascending. Returns a list of
    _source dicts with timestamp and the index's value field.
    """
    val_fld = hp.getValueField(idx)
    body = {
        "size": 0,
        "_source": ["timestamp", val_fld],
        "sort": [
            {"timestamp": {"order": "asc"}}
        ],
        "query": {
            "bool": {
                "must": [
                    {"range": {"timestamp": {"gte": period[0], "lte": period[1]}}},
                    {"term": {"src": {"value": src}}},
                    {"term": {"dest": {"value": dest}}},
                    {"term": {"src_production": True}},
                    {"term": {"dest_production": True}}
                ]
            }
        }
    }
    records = []
    for position, hit in enumerate(scan(client=hp.es, index=idx, query=body)):
        if position % 100000 == 0:
            print(position)  # progress marker for long scans
        records.append(hit['_source'])
    return records
def AggBySrcDestIP(idx, time_from, time_to):
    """Average value per (src_host, ipv6, dest_host) triple in a time range.

    Returns a list of {'dest_host', 'src_host', 'ipv6', <value_field>,
    'num_tests'} dicts, where num_tests is the bucket's document count.

    BUG FIX: the original issued a single request, silently truncating the
    result at the composite size of 10000 buckets; this version pages
    through the composite aggregation using 'after_key'.
    """
    val_fld = hp.getValueField(idx)

    def _build_query(after_key=None):
        # One composite page, optionally resuming after the previous page.
        composite = {
            "size": 10000,
            "sources": [
                {"src_host": {"terms": {"field": "src_host", "missing_bucket": True, "order": "asc"}}},
                {"ipv6": {"terms": {"field": "ipv6", "missing_bucket": True, "order": "asc"}}},
                {"dest_host": {"terms": {"field": "dest_host", "missing_bucket": True, "order": "asc"}}}
            ]
        }
        if after_key is not None:
            composite["after"] = after_key
        return {
            "size": 0,
            "_source": False,
            "query": {
                "range": {"timestamp": {"from": time_from, "to": time_to}}
            },
            "aggregations": {
                "groupby": {
                    "composite": composite,
                    "aggregations": {
                        "mean_field": {"avg": {"field": val_fld}}
                    }
                }
            }
        }

    data = []
    after = None
    while True:
        results = hp.es.search(index=idx, body=_build_query(after))
        groupby = results["aggregations"]["groupby"]
        buckets = groupby["buckets"]
        for item in buckets:
            data.append({'dest_host': item['key']['dest_host'],
                         'src_host': item['key']['src_host'],
                         'ipv6': item['key']['ipv6'],
                         val_fld: item['mean_field']['value'],
                         'num_tests': item['doc_count']})
        after = groupby.get("after_key")
        if not buckets or after is None:
            break
    return data