def get_avg_vectors(self):
    ''' (self) -> None
    Create a 30-day rolling average of log statistics to date
    '''
    month_vecs = []
    grouped_vecs = {}

    # create a list of 31 dates, starting the day before
    # yesterday since yesterday is the test data set
    start = (datetime.now(pytz.timezone(esconstants.TIMEZONE)) -
             timedelta(days=esconstants.TEST_DAY + 1))
    dates = [start]
    for i in range(1, esconstants.NUM_DAYS):
        dates.append(start - timedelta(days=i))

    # Monday = 0, Sunday = 6
    dayofweek = start.weekday()

    # create argument namespace for ESQuery
    args = argparse.Namespace()
    args.action = 'stringquery'
    args.count = False
    args.fields = None
    args.host = esconstants.ES_HOST
    args.index = 'logstash*'
    args.list = False
    args.query = None
    args.size = esconstants.ES_QUERY_SIZE
    args.terms = ['*']

    # get 30 days' worth of logs
    query = ESQuery()
    for adate in dates:
        datestr = adate.isoformat()[:10]
        # create a single-day date range, pass to esquery
        args.range = [esconstants.KEY_ES_TIMESTAMP, datestr, datestr]
        query.post(args)
        result = query.response.json()

        # skip empty results
        if result[esconstants.KEY_ES_SEARCH][esconstants.KEY_ES_TOTAL] == 0:
            print('Dropped day: ' + datestr)
            # decrement day
            dayofweek = (dayofweek - 1) % 7
            continue

        self._result_data = result
        self._count_day()

        # append the day's results to the month as (weekday, vectors) pairs
        print('Got day: ' + datestr)
        month_vecs.append(dayofweek)
        month_vecs.append(self._result_vectors)
        # decrement day
        dayofweek = (dayofweek - 1) % 7
    print()

    # generate a single array containing the avg of each day in the past 30:
    # first group all like vectors by day, type and subtype
    day = 0
    for i, elt in enumerate(month_vecs):
        if i % 2 == 0:
            # even elements are weekdays; add unseen days to the structure
            day = elt
            if elt not in grouped_vecs:
                grouped_vecs[elt] = {}
        else:
            # odd elements are vectors; organize them by type and subtype
            grouped_vecs[day] = self._organize_vectors(elt, grouped_vecs[day])

    # calculate average scalars by day, type and subtype
    for day in grouped_vecs:
        for type_key in grouped_vecs[day]:
            for subtype in grouped_vecs[day][type_key]:
                # a single (1-D) vector needs no averaging
                if grouped_vecs[day][type_key][subtype].ndim == 1:
                    continue
                grouped_vecs[day][type_key][subtype] = numpy.average(
                    grouped_vecs[day][type_key][subtype], axis=0)

    self.grouped_vecs = grouped_vecs
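When the method returns, grouped_vecs maps each weekday (0-6) to a nested {type: {subtype: vector}} structure, in which any subtype observed on several same-weekday dates has been collapsed to an element-wise mean. Here is a minimal sketch of that final reduction, using a hypothetical grouped_vecs with made-up 'syslog'/'sshd' keys and counts:

import numpy

# hypothetical sample: one weekday (0 = Monday) holding three stacked
# daily vectors for a made-up 'syslog'/'sshd' type/subtype pair
grouped_vecs = {
    0: {
        'syslog': {
            'sshd': numpy.array([[10.0, 2.0, 1.0],
                                 [12.0, 4.0, 0.0],
                                 [14.0, 0.0, 2.0]]),
        },
    },
}

# the same reduction get_avg_vectors() applies: collapse stacked rows to
# a single mean vector per day/type/subtype, leaving 1-D vectors alone
for day in grouped_vecs:
    for type_key in grouped_vecs[day]:
        for subtype in grouped_vecs[day][type_key]:
            vecs = grouped_vecs[day][type_key][subtype]
            if vecs.ndim > 1:
                grouped_vecs[day][type_key][subtype] = numpy.average(vecs, axis=0)

print(grouped_vecs[0]['syslog']['sshd'])  # [12.  2.  1.]

Averaging along axis 0 keeps each statistic in its own column, so a day's fresh vector can later be compared element-for-element against its weekday baseline.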
import argparse
from datetime import datetime, timedelta

import pytz

import esconstants
# ESQuery, ESVectorize and ESAnalyze are project classes; their import
# paths are project-specific and assumed to be available here

# the date under test as YYYY-MM-DD; assumed here to be TEST_DAY days
# ago in the configured timezone, matching get_avg_vectors()
day = (datetime.now(pytz.timezone(esconstants.TIMEZONE)) -
       timedelta(days=esconstants.TEST_DAY)).isoformat()[:10]

# create argument namespace for ESQuery
args = argparse.Namespace()
args.action = 'stringquery'
args.count = False
args.fields = None
args.host = esconstants.ES_HOST
args.index = 'logstash*'
args.list = False
args.query = None
args.range = [esconstants.KEY_ES_TIMESTAMP, str(day), str(day)]
# get all the things
args.size = 1000000
args.terms = ['*']

# perform elasticsearch query
query = ESQuery()
query.post(args)
response = query.response

# stuff data in normalization class
r = response.json()

# load vectorize object with the day's results
vec = ESVectorize(r)

# obtain vectors for the day and the training data
vec.get_day_vectors()
vec.get_avg_vectors()

# now analyze and compare
result = ESAnalyze(vec.day_vecs, vec.grouped_vecs, plot=False)
result.data_clean()
result.outlier_detection()
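Two details in this driver are worth noting. Passing the same date as both endpoints of args.range restricts the query to a single day, the same trick get_avg_vectors() uses for each baseline day. And the day under test is deliberately excluded from the 30-day average (the baseline starts the day before it), so ESAnalyze compares the fresh vectors against history that does not already contain them.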