def filter(log, dictionary=None):
    """Return the log entries that do not closely repeat an earlier entry.

    Entries are visited in order. Every entry whose ``'CAT'`` is not
    ``"ignore"`` is kept. Each later, not-yet-ignored entry that lies at most
    20 minutes after a kept entry is compared with it via
    ``PearsonCorrelation.correlation``; when the correlation exceeds
    ``getRequiredCorrelation(seconds_apart)`` the later entry is marked as
    ignored and is therefore skipped when the outer loop reaches it.

    Progress is printed to stdout roughly every 0.1% of the log.

    Args:
        log: Indexable sequence of dict entries providing the ``'MESSAGE'``,
            ``'CAT'`` and ``'EVENT_TIME'`` keys (``EVENT_TIME`` formatted as
            ``%Y-%m-%d-%H.%M.%S.%f``). The scan for duplicates stops at the
            first entry more than 20 minutes later, so entries are presumably
            sorted by time -- confirm against the caller.
            NOTE: suppressed entries are modified in place (their ``'CAT'``
            is set to ``"ignore"``).
        dictionary: Set of words handed to
            ``PearsonCorrelation.correlation``. When ``None`` it is built
            from every space-separated token of every ``'MESSAGE'`` in
            ``log``.

    Returns:
        List of the kept entries (the same dict objects), in original order.

    Raises:
        ValueError: If an ``'EVENT_TIME'`` that has to be parsed does not
            match the expected timestamp format.
    """
    if dictionary is None:
        dictionary = set()
        for entry in log:
            dictionary.update(entry["MESSAGE"].split(" "))

    try:
        index_range = xrange  # Python 2: lazy, no index list per inner scan
    except NameError:
        index_range = range  # Python 3

    timestamp_format = "%Y-%m-%d-%H.%M.%S.%f"
    window_seconds = 20 * 60
    total = len(log)
    # Report about every 0.1% of the log. The step must never be 0, or the
    # modulo below raises ZeroDivisionError for logs under 1000 entries.
    step = max(1, total // 1000)

    parsed_times = {}

    def event_time(index):
        # An entry can be compared against many predecessors; parse its
        # timestamp only the first time it is needed.
        if index not in parsed_times:
            parsed_times[index] = datetime.strptime(
                log[index]['EVENT_TIME'], timestamp_format)
        return parsed_times[index]

    result = []
    # Both loops run up to and including the final index; stopping at
    # len(log) - 1 would silently drop the last entry from the result.
    for i in index_range(total):
        if i % step == 0:
            # Derive the percentage from the position instead of accumulating
            # 0.1 per report, which drifts when total is not a multiple of 1000.
            print("%.1f %% complete" % (100.0 * i / total))

        current = log[i]
        if current['CAT'] == "ignore":
            continue
        result.append(current)

        current_time = event_time(i)
        for j in index_range(i + 1, total):
            candidate = log[j]
            if candidate['CAT'] == "ignore":
                continue
            seconds_apart = (event_time(j) - current_time).total_seconds()
            if seconds_apart > window_seconds:
                break
            required_corr = getRequiredCorrelation(seconds_apart)
            corr = PearsonCorrelation.correlation(current, candidate, dictionary)
            if corr > required_corr:
                # Mark rather than delete, so indices stay valid while looping.
                candidate['CAT'] = "ignore"
    return result
def testSomewhatSimilarLogs(self):
    """Messages that share some of their words correlate positively."""
    vocabulary = {
        "hello", "I", "am", "a", "log", "file",
        "these", "are", "some", "other", "words",
        "something", "else",
    }
    first = {'MESSAGE': "hello I am a log file"}
    second = {'MESSAGE': "hello I am something else"}

    # The two messages overlap on "hello I am" only.
    self.assertGreater(
        PearsonCorrelation.correlation(first, second, vocabulary), 0.0)
def testUnequalLogs(self):
    """Messages with no word in common correlate negatively."""
    vocabulary = {
        "hello", "I", "am", "a", "log", "file",
        "these", "are", "some", "other", "words",
        "Bobby", "likes", "ice", "cream",
    }
    first = {'MESSAGE': "hello I am a log file"}
    second = {'MESSAGE': "Bobby likes ice cream"}

    self.assertLess(
        PearsonCorrelation.correlation(first, second, vocabulary), 0.0)
def testEqualUnorderedLogs(self):
    """Messages made of the same words correlate perfectly.

    The second message uses exactly the words of the first, reordered and
    with "log" and "file" repeated; the expected correlation is 1.0.
    """
    log1 = {'MESSAGE': "hello I am a log file"}
    log2 = {'MESSAGE': "I log a file hello am log file"}
    dictionary = {"hello", "I", "am", "a", "log", "file",
                  "these", "are", "some", "other", "words"}
    corr = PearsonCorrelation.correlation(log1, log2, dictionary)
    # The correlation is a computed float, so compare with a tolerance:
    # exact equality would fail on a last-bit rounding difference.
    self.assertAlmostEqual(corr, 1.0)