class TextFormInfer:
    """Cluster text messages and infer an aligned format for each cluster.

    Clustering is delegated to a ``TextClassify`` instance built from the
    messages passed to the constructor. Each resulting cluster is wrapped in
    a ``Symbol`` and aligned with ``Format.splitAligned``.
    """

    def __init__(self, messages):
        """Create the classifier and one tuning helper per protocol type.

        Args:
            messages: Messages handed unchanged to ``TextClassify``.
        """
        self.clser = TextClassify(messages)
        self.httptuning = HttpDataTuning()
        self.ftptuning = FTPDataTuning()
        self.redistuning = RedisDataTuning()

    def _tuneClusters(self, clsDatas, infercls):
        """Pass the clusters to the tuner selected by ``infercls``.

        ``'H'`` selects the HTTP tuner, ``'F'`` the FTP tuner and any other
        value the Redis tuner. The return value of ``getMsgsLen`` is ignored,
        exactly as in the original inline code.
        """
        clusters = list(clsDatas.values())
        if infercls == 'H':
            tuner = self.httptuning
        elif infercls == 'F':
            tuner = self.ftptuning
        else:
            tuner = self.redistuning
        tuner.getMsgsLen(clusters)

    def _inferFormats(self, clsDatas):
        """Build one aligned ``Symbol`` per cluster, in ``clsDatas`` order."""
        aligner = Format()
        clsFormats = []
        for clsData in clsDatas.values():
            symbol = Symbol(messages=[RawMessage(message) for message in clsData])
            # splitAligned works on the symbol in place; its return value is
            # not used.
            aligner.splitAligned(symbol, doInternalSlick=True)
            clsFormats.append(symbol)
        return clsFormats

    def ldaFormatInfer(self, wSize, TK, wLen, Kcls, infercls='H'):
        """Cluster with ``clsMessages`` and infer a format per cluster.

        Args:
            wSize, TK, wLen, Kcls: Forwarded unchanged to
                ``self.clser.clsMessages``.
            infercls: Tuner selector: ``'H'`` (HTTP), ``'F'`` (FTP),
                anything else Redis.

        Returns:
            A list with one ``Symbol`` per cluster.
        """
        clsDatas = self.clser.clsMessages(wSize, TK, wLen, Kcls)
        self._tuneClusters(clsDatas, infercls)
        return self._inferFormats(clsDatas)

    def ladDbscanFormatInfer(self, wSize, TK, wLen, mindis, minpt, infercls='H'):
        """Cluster with ``clsByDbscan`` and infer a format per cluster.

        Args:
            wSize, TK, wLen, mindis, minpt: Forwarded unchanged to
                ``self.clser.clsByDbscan``.
            infercls: Tuner selector: ``'H'`` (HTTP), ``'F'`` (FTP),
                anything else Redis. Defaults to ``'H'`` to match
                ``ldaFormatInfer``.

        Returns:
            A list with one ``Symbol`` per cluster.
        """
        clsDatas = self.clser.clsByDbscan(wSize, TK, wLen, mindis, minpt)
        self._tuneClusters(clsDatas, infercls)
        return self._inferFormats(clsDatas)
class TextClassifyLogic:
    """Group messages by the frequent words they contain and infer formats.

    Message objects are expected to expose ``now()`` (the text inspected for
    frequent words) and a ``message`` attribute (the payload handed to
    ``RawMessage``).
    """

    def __init__(self, messages, tRate, sRate, wRate, wHeight):
        """Store the thresholds and create the protocol tuning helpers.

        Args:
            messages: Full message set; its length is the denominator of
                every ``tRate`` comparison in ``ClassifyCircleLy``.
            tRate: Minimum share of ``messages`` a group must exceed to be
                split further.
            sRate: Stored as ``self.srate``.
            wRate: Currently unused (see note below).
            wHeight: Minimum length of a frequent word.
        """
        self.tRate = tRate
        self.srate = sRate
        # NOTE(review): the wRate argument is ignored and wRate mirrors sRate.
        # Left unchanged because existing callers may depend on it -- confirm
        # whether `self.wRate = wRate` was intended.
        self.wRate = self.srate
        self.wHeight = wHeight
        self.messages = messages
        # Frequent words used by ConvertFreWords; filled in by FormatInfer.
        self.freWords = set()
        self.httpData = HttpDataTuning()
        self.ftpData = FTPDataTuning()
        self.redisData = RedisDataTuning()

    def GetLocData(self, datas):
        """Return ``data.now()`` for every item of ``datas``, in order."""
        return [data.now() for data in datas]

    def filterShort(self, freWords, h):
        """Return the set of words from ``freWords`` with length >= ``h``."""
        return {value for value in freWords if len(value) >= h}

    def GetFrequentWords(self, rate, h, datas):
        """Run ``ApriorFreAnalyZer`` on the messages and drop short words.

        Args:
            rate: Passed to ``ApriorFreAnalyZer`` as its second argument.
            h: Minimum word length kept by ``filterShort``.
            datas: Message objects exposing ``now()``.

        Returns:
            A set of frequent words of length >= ``h``.
        """
        texts = [str(loc) for loc in self.GetLocData(datas)]
        freWords = ApriorFreAnalyZer(texts, rate).getApriorFre()
        return self.filterShort(freWords, h)

    def RankWord(self, word, datas):
        """Count the messages containing ``word`` and average its position.

        Returns:
            ``(cnt, (loc + 1) / cnt)`` where ``cnt`` is the number of
            messages containing ``word`` and ``loc`` the sum of its first
            offsets. ``(0, 0.0)`` when no message contains the word, instead
            of dividing by zero.
        """
        texts = [str(loc) for loc in self.GetLocData(datas)]
        cnt = 0
        loc = 0
        for text in texts:
            tempLoc = text.find(word)
            if tempLoc != -1:
                cnt += 1
                loc += tempLoc
        print(word, cnt, loc)
        if cnt == 0:
            return (0, 0.0)
        return (cnt, (loc + 1) / cnt)

    def RankWords(self, freWords, datas):
        """Build ``(word, count, position)`` tuples and sort them.

        The ordering is whatever ``BaseRankModel.sortList`` produces.
        """
        words = []
        for freWord in freWords:
            cnt, avgLoc = self.RankWord(freWord, datas)
            words.append((freWord, cnt, avgLoc))
        return BaseRankModel.sortList(words)

    def ConvertFreWords(self, data):
        """Concatenate the frequent words found in ``data`` by position.

        Every word of ``self.freWords`` that occurs in ``data`` is kept; the
        words are ordered by their first offset and joined without separator.
        """
        positions = {}
        for freWord in self.freWords:
            lo = data.find(freWord)
            if lo != -1:
                positions[freWord] = lo
        ordered = sorted(positions.items(), key=lambda item: item[1])
        return ''.join(word for word, _ in ordered)

    def GetWodsRank(self, datas):
        """Return the ranked frequent words of ``datas``.

        Uses ``self.wRate`` and ``self.wHeight`` as support and length limits.
        """
        freWords = self.GetFrequentWords(self.wRate, self.wHeight, datas)
        return self.RankWords(freWords, datas)

    def ClassifyMessages(self, messages):
        """Group messages by their frequent-word pattern.

        Returns:
            A dict mapping the pattern from ``ConvertFreWords`` to the list
            of messages that produced it.
        """
        msgSet = {}
        for message in messages:
            pattern = self.ConvertFreWords(str(message.message))
            msgSet.setdefault(pattern, []).append(message)
        return msgSet

    def ClassifyCircleLy(self, preWords, messages):
        """Recursively split ``messages`` on the first eligible ranked word.

        A ranked word is eligible when it is not in ``preWords``, its count
        exceeds ``tRate`` as a share of the *full* message set
        (``self.messages``, not the current subset), and it does not occur
        in every message of the current subset.

        Args:
            preWords: Set of rank entries already used on the current path;
                mutated during the recursion and restored before returning.
            messages: Messages to split.

        Returns:
            ``messages`` itself when no word is eligible, otherwise a nested
            list of groups.
        """
        rankWords = self.GetWodsRank(messages)
        funCode = None
        for word in rankWords:
            # NOTE(review): preWords holds whole rank entries, so membership
            # presumably only matches when count and position are identical
            # too -- confirm that this is intended.
            if (word not in preWords
                    and word[1] / len(self.messages) > self.tRate
                    and word[1] != len(messages)):
                funCode = word
                break
        print(funCode, len(messages))
        if funCode is None:
            return messages

        fResult = []
        total = len(self.messages)
        clsOne, clsTwo = self.ClassifyByCodes(funCode[0], messages)
        print(len(clsOne), len(clsTwo))
        # Messages without the word: split further if the group is large enough.
        if len(clsTwo) / total > self.tRate:
            fResult.append(self.ClassifyCircleLy(preWords, clsTwo))
        elif len(clsTwo) > 0:
            fResult.append(clsTwo)
        # Messages with the word: the word is excluded while descending.
        if len(clsOne) / total > self.tRate:
            preWords.add(funCode)
            fResult.append(self.ClassifyCircleLy(preWords, clsOne))
            preWords.remove(funCode)
        else:
            fResult.append(clsOne)
        return fResult

    def ClassifyByCodes(self, codes, messages):
        """Partition ``messages`` by whether ``str(now())`` contains ``codes``.

        Returns:
            ``(clsOne, clsTwo)``: messages containing ``codes`` and the rest.
        """
        clsOne = []
        clsTwo = []
        for message in messages:
            if str(message.now()).find(codes) != -1:
                clsOne.append(message)
            else:
                clsTwo.append(message)
        return (clsOne, clsTwo)

    def _buildFormats(self, groups):
        """Build one aligned ``Symbol`` per group of message objects."""
        aligner = Format()
        finalFormats = []
        for group in groups:
            symbol = Symbol(messages=[RawMessage(item.message) for item in group])
            # splitAligned works on the symbol in place; its return value is
            # not used.
            aligner.splitAligned(symbol, doInternalSlick=True)
            finalFormats.append(symbol)
        return finalFormats

    def FormatInfer(self, rate, h):
        """Group ``self.messages`` by frequent-word pattern and infer formats.

        The frequent words are computed from ``self.messages`` and stored in
        ``self.freWords`` so that ``ConvertFreWords`` can use them. The
        previous code omitted the ``datas`` argument and read an unset
        ``self.datas``, so it always raised.

        Args:
            rate: Support passed on to ``GetFrequentWords``.
            h: Minimum frequent-word length.

        Returns:
            A list with one ``Symbol`` per pattern group.
        """
        self.freWords = self.GetFrequentWords(rate, h, self.messages)
        messageClassify = self.ClassifyMessages(self.messages)
        return self._buildFormats(messageClassify.values())

    def FormatInferCirclely(self, messages, Mtype):
        """Classify recursively, pass the groups to a tuner, infer formats.

        Args:
            messages: Messages to classify.
            Mtype: Tuner selector: ``'H'`` (HTTP), ``'F'`` (FTP), anything
                else Redis.

        Returns:
            A list with one ``Symbol`` per group.
        """
        result = self.classifyMessages(set(), messages)
        clsResult = [[msg.message for msg in res] for res in result]
        if Mtype == 'H':
            self.httpData.getMsgsLen(clsResult)
        elif Mtype == 'F':
            self.ftpData.getMsgsLen(clsResult)
        else:
            self.redisData.getMsgsLen(clsResult)
        return self._buildFormats(result)

    def formatInfer(self, messages):
        """Classify recursively and infer one format per group."""
        return self._buildFormats(self.classifyMessages(set(), messages))

    def filterSets(self, result, fResult):
        """Fill ``fResult`` from ``result`` via ``Converter.ConvertMultiListPure``."""
        cverter = Converter()
        cverter.ConvertMultiListPure(result, fResult)

    def classifyMessages(self, preSet, messages):
        """Run ``ClassifyCircleLy`` and return its output after ``filterSets``.

        Args:
            preSet: Set passed to ``ClassifyCircleLy`` as ``preWords``.
            messages: Messages to classify.

        Returns:
            The list filled in by ``filterSets``.
        """
        datas = self.ClassifyCircleLy(preSet, messages)
        result = []
        self.filterSets(datas, result)
        return result