import random
import re
from collections import defaultdict

# Project-level helpers (SPLITTER, DataSetIter, get_f, flatten, persist,
# utils, consts, conf) are assumed importable from the surrounding package.


def train(self, trainSet, ruleType, ifPersist=True, datasize=0):
    # NB: `datasize` is unused in this agent-based variant.
    counter = defaultdict(set)
    totalApps = set()
    for tbl, pkg in DataSetIter.iter_pkg(trainSet):
        if pkg.agent == 'None':
            continue
        # The original used side-effecting map()/filter(), which is a no-op
        # under Python 3's lazy map; use explicit loops instead.
        words = [w for w in SPLITTER.split(utils.process_agent(pkg.agent)) if w]
        for w in words:
            counter[w].add(pkg.app)
        segAgent = tuple(['^'] + words + ['$'])
        self.HDB.append((segAgent, pkg.app, len(segAgent)))
        totalApps.add(pkg.app)
    self.omega = len(totalApps) * self.support_t
    self.totalApp = len(totalApps) * 1.0
    self.HDB = list(set(self.HDB))
    print("Data Size", len(self.HDB))
    for (t, c, l) in self.HDB:
        for w in t:
            counter[w].add(c)
    self.IDF = utils.cal_idf(counter)
    self.mine_context()
    if ifPersist:
        # The original called persist(Rules) with `Rules` undefined and
        # ignored ifPersist; mine_context() is assumed to populate
        # self.rules, which classify() below reads.
        persist(self.rules)
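# A quick, hedged illustration of the tokenization used in train() above.
# _DEMO_SPLITTER is a stand-in: the project's real SPLITTER pattern and
# utils.process_agent may tokenize differently.
def _demo_agent_tokens():
    _DEMO_SPLITTER = re.compile(r'[/ ;()]')  # assumption for illustration
    agent = 'Dalvik/2.1.0 (Linux; Android 9)'
    tokens = tuple(['^'] + [w for w in _DEMO_SPLITTER.split(agent) if w] + ['$'])
    assert tokens == ('^', 'Dalvik', '2.1.0', 'Linux', 'Android', '9', '$')
    return tokens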
def classify(self, testSet):
    compressed = defaultdict(lambda: defaultdict(set))
    for tbl, pkg in DataSetIter.iter_pkg(testSet):
        agent = tuple(
            ['^'] +
            [w for w in SPLITTER.split(utils.process_agent(pkg.agent)) if w] +
            ['$'])
        compressed[agent][pkg.rawHost].add(pkg)
    batchPredicts, groundTruth = {}, {}
    for agent, host, pkgs in flatten(compressed):
        # The original asserted a (bool, str) tuple, which is always truthy;
        # assert the condition and message separately.
        assert isinstance(pkgs, set), "Type of pkgs is not correct " + str(type(pkgs))
        predict = {}
        for ruleType in self.rules:
            predict[ruleType] = self.rules[ruleType].search(agent)
        for pkg in pkgs:
            batchPredicts[pkg.id] = predict
            groundTruth[pkg.id] = pkg.app
            if (predict[consts.APP_RULE].label is not None
                    and predict[consts.APP_RULE].label != pkg.app):
                print('>>>[AGENT CLASSIFIER ERROR] agent:', pkg.agent,
                      'App:', pkg.app, 'Prediction:', predict[consts.APP_RULE])
    return batchPredicts  # groundTruth is collected but not returned
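# Hedged end-to-end sketch of the two methods above. The owning class name
# (AgentClassifier) is an assumption for illustration; train(), classify(),
# and consts.APP_RULE come from the code above and the surrounding project.
def _demo_train_then_classify(trainSet, testSet):
    clf = AgentClassifier()              # hypothetical class name
    clf.train(trainSet, consts.APP_RULE)
    predictions = clf.classify(testSet)  # {pkgId: {ruleType: prediction}}
    for pkgId, perRule in predictions.items():
        print(pkgId, perRule[consts.APP_RULE].label)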
def __compare(rst, testSet):
    """Compare predictions against the test data set.

    Input:
    :param rst     : Predictions from the test run. {pkgId: {ruleType: prediction}}
    :param testSet : Test data set. {pkgId: package}

    Output:
    - FPR, TPR (recall), PPV (precision), plus the sets of correctly
      detected, wrongly detected, and detected apps.
    """
    P, groundT, T = defaultdict(set), defaultdict(set), defaultdict(int)
    for tbl, pkg in DataSetIter.iter_pkg(testSet):
        groundT[pkg.id] = pkg
        T[pkg.app] += 1
    for pkgId, predictions in rst.items():
        if predictions[consts.APP_RULE] is not None:
            label = predictions[consts.APP_RULE].label
            if label is not None:
                P[label].add(pkgId)
    total = len(groundT)
    correctApp, wrongApp, detectApp = set(), set(), set()
    accTP, accFP, accTN, accFN = 0, 0, 0, 0
    for app, tNum in T.items():
        TP = 0
        pNum = len(P[app])
        for pkgId in P[app]:
            if app == groundT[pkgId].app:
                TP += 1
        if TP > 0 and TP == pNum:
            correctApp.add(app)
        elif pNum > 0 and TP != pNum:
            wrongApp.add(app)
        if pNum > 0:
            detectApp.add(app)
        FP = pNum - TP
        FN = tNum - TP
        TN = (total - tNum) - FP
        accTP += TP
        accFP += FP
        accTN += TN
        accFN += FN
    FPR = accFP / (accFP + accTN)
    TPR = accTP / (accTP + accFN)  # Recall
    PPV = accTP / (accTP + accFP)  # Precision
    print('Recall:', (accTP + accFN), '#', accTP)
    print('Precision:', (accTP + accFP), '#', accTP)
    return FPR, TPR, PPV, correctApp, wrongApp, detectApp
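# Sanity check of the confusion-matrix bookkeeping in __compare(), with
# illustrative numbers: 10 packets total, 4 truly belong to the app, 5
# predicted for it, 3 of those correct.
def _demo_confusion_counts():
    total, tNum = 10, 4
    pNum, TP = 5, 3
    FP = pNum - TP               # 2 packets wrongly attributed to the app
    FN = tNum - TP               # 1 of the app's packets missed
    TN = (total - tNum) - FP     # 4 of the 6 remaining packets left alone
    assert TP + FP + TN + FN == total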
def cal_roc(rst, testSet):
    P, groundT, T = defaultdict(set), defaultdict(set), defaultdict(int)
    for tbl, pkg in DataSetIter.iter_pkg(testSet):
        groundT[pkg.id] = pkg
        T[pkg.app] += 1
    scores = set()
    for pkgId, predictions in rst.items():
        if predictions[consts.APP_RULE] is not None:
            predict = predictions[consts.APP_RULE]
            if predict.label is not None:
                P[predict.label].add((pkgId, predict.score))
                scores.add(predict.score)
    total = len(groundT)
    roc = {}
    print("Start Construct ROC Curve ", len(scores))
    # Sweep every observed score as a threshold and recompute the
    # accumulated confusion counts at that operating point.
    for scoreT in scores:
        correctApp, wrongApp, detectApp = set(), set(), set()
        accTP, accFP, accTN, accFN = 0, 0, 0, 0
        for app, tNum in T.items():
            TP = 0
            tmpP = [(pkgId, s) for (pkgId, s) in P[app] if s >= scoreT]
            pNum = len(tmpP)
            for pkgId, _ in tmpP:
                if app == groundT[pkgId].app:
                    TP += 1
            if TP > 0 and TP == pNum:
                correctApp.add(app)
            elif pNum > 0 and TP != pNum:
                wrongApp.add(app)
            if pNum > 0:
                detectApp.add(app)
            FP = pNum - TP
            FN = tNum - TP
            TN = (total - tNum) - FP
            accTP += TP
            accFP += FP
            accTN += TN
            accFN += FN
        FPR = accFP / (accFP + accTN)
        TPR = accTP / (accTP + accFN)  # Recall
        PPV = accTP / (accTP + accFP)  # Precision
        roc[scoreT] = (FPR, TPR, PPV)
    return roc
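# A minimal consumer of the dict returned by cal_roc(): sort the (FPR, TPR)
# points by FPR and approximate AUC with the trapezoidal rule. This helper
# is a sketch, not part of the original module; it assumes at least two
# distinct scores were observed.
def auc_from_roc(roc):
    pts = sorted((fpr, tpr) for fpr, tpr, _ in roc.values())
    area = 0.0
    for (x0, y0), (x1, y1) in zip(pts, pts[1:]):
        area += (x1 - x0) * (y0 + y1) / 2.0
    return area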
def __count(miner, check, ruleType):
    # NB: this reads like a helper nested inside the host-rule mining
    # routine: `self`, `trainSet`, and `hostRules` are assumed to be
    # captured from the enclosing scope.
    hostNodes = self.root.children.values()
    tmpR = defaultdict(set)
    for node in filter(miner.filter, hostNodes):
        features = miner.features(self.fLib, node.appInfos)
        if 'ohmychef' in node.feature:  # ad-hoc debug trace from the original
            print('[uri128]', check(features, node.feature))
        if check(features, node.feature):
            tmpR[ruleType].add(node.feature)
    for tbl, pkg in DataSetIter.iter_pkg(trainSet):
        if pkg.host in tmpR[ruleType]:
            hostRules[ruleType][(pkg.rawHost, None, miner.label(pkg))].add(tbl)
def train(self, trainData, rule_type, ifPersist=True):
    rawHost = defaultdict(set)
    for tbl, pkg in DataSetIter.iter_pkg(trainData):
        rawHost[pkg.host].add(pkg.rawHost)
        features = get_f(pkg)
        self.__count(features, pkg.app)
        self.add(self.root, features[1:], pkg.appInfo, tbl)
    hostRules = self.__host_rules(trainData)
    pathRules = self.__path_rules(trainData)
    if ifPersist:
        self._persist(hostRules)
        self._persist(pathRules)
    return hostRules, pathRules
def train(self, trainData, ruleType):
    for datasize in [200000, 300000, 400000, 500000, 600000, 700000]:
        HDB = [(tbl, pkg) for tbl, pkg in DataSetIter.iter_pkg(trainData)]
        # Assumes the data set holds at least `datasize` packets.
        HDB = [HDB[i] for i in sorted(random.sample(range(len(HDB)), datasize))]
        print('Datasize:', len(HDB))
        groups = defaultdict(list)
        for tbl, pkg in HDB:
            for host, key, value in get_f(pkg):
                groups[host].append((pkg.trackId, tbl, key, value, pkg.app))
        print('Start mining frequent contexts')
        for host in groups:
            contexts = defaultdict(set)
            omega = set()
            seqs = defaultdict(set)
            sigAppMap = defaultdict(set)
            appSigMap = defaultdict(set)
            SC = defaultdict(set)
            date = defaultdict(set)
            for pkgId, tbl, context, sig, app in groups[host]:
                omega.add(app)
                contexts[context].add(app)
                sigAppMap[sig].add(app)
                appSigMap[app].add(sig)
                SC[sig].add(pkgId)
                date[(sig, app, context)].add(tbl)
                date[(app, context)].add(tbl)
                seqs[context].add(sig)
            for context, apps in contexts.items():
                support = len(apps) * 1.0 / len(omega)
                if support > conf.query_labelT:
                    effective = 0
                    for sig in seqs[context]:
                        if len(sigAppMap[sig]) == 1:
                            app = list(sigAppMap[sig])[0]
                            rel = 1.0 / len(appSigMap[app])
                            dateScore = 1.0 * len(date[(sig, app, context)]) / len(date[(app, context)])
                            effective += rel * dateScore
                    effective = effective / len(sigAppMap)
                    # Quality is the harmonic mean of effectiveness and support.
                    quality = 2 * effective * support / (effective + support)
                    if quality > conf.query_scoreT:
                        for sig in seqs[context]:
                            if len(sigAppMap[sig]) == 1:
                                app = list(sigAppMap[sig])[0]
                                rel = 1.0 / len(appSigMap[app])
                                dateScore = 1.0 * len(date[(sig, app, context)]) / len(date[(app, context)])
                                sigQ = rel * dateScore
                                for pkgId in SC[sig]:
                                    # The original appended the `consts` module
                                    # here; `context` is almost certainly meant.
                                    self.Rules[pkgId].append((quality, sigQ, context, host, sig))
        # Keep only the top-K rules per packet, ranked by quality then
        # signature quality. (The original indexed an undefined `i` and
        # sorted on (x[1], x[2]); (x[0], x[1]) matches the tuple layout.)
        for pkgId in self.Rules:
            self.Rules[pkgId] = sorted(self.Rules[pkgId], key=lambda x: (x[0], x[1]), reverse=True)[:conf.query_K]
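# The quality score in train() above is the harmonic mean (an F1-style
# blend) of a context's effectiveness and its support, restated standalone
# for clarity. This helper is illustrative, not part of the original module.
def rule_quality(effective, support):
    if effective + support == 0:
        return 0.0
    return 2.0 * effective * support / (effective + support)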
def __path_rules(self, trainSet):
    pathRules = defaultdict(lambda: defaultdict(set))
    tmpR = defaultdict(set)
    # Only path segments that map to exactly one label are candidates.
    for pathSeg, labels in filter(lambda x: len(x[1]) == 1, self.pathLabel.items()):
        label = list(labels)[0]
        fs = self.fLib[consts.APP_RULE][label]
        ifValid = part(fs, pathSeg)
        if ifValid:
            tmpR[consts.APP_RULE].add(pathSeg)
    for tbl, pkg in DataSetIter.iter_pkg(trainSet):
        pkgFs = set(get_f(pkg)[2:])
        for pathSeg in tmpR[consts.APP_RULE]:
            if pathSeg in pkgFs:
                pathRules[consts.APP_RULE][(pkg.rawHost, pathSeg, pkg.label)].add(tbl)
    return pathRules
def train(self, trainSet, ruleType, datasize):
    counter = defaultdict(set)
    totalApps = set()
    for tbl, pkg in DataSetIter.iter_pkg(trainSet):
        if pkg.path == 'None':
            continue
        words = [w for w in pkg.path.split('/') if w]
        for w in words:
            counter[w].add(pkg.app)
        segPath = tuple(['^'] + words + ['$'])
        # Collapse numeric host labels so e.g. numbered shard hosts share
        # one group key.
        host = re.sub(r'[0-9]+\.', '[0-9]+.', pkg.rawHost)
        self.HDB.append((segPath, pkg.app, len(segPath), host))
        totalApps.add(pkg.app)
    self.omega = len(totalApps) * self.support_t
    self.totalApp = len(totalApps) * 1.0
    # self.HDB = list(set(self.HDB))
    self.HDB = [self.HDB[i] for i in sorted(random.sample(range(len(self.HDB)), datasize))]
    print("Data Size", len(self.HDB))
    groups = defaultdict(list)
    for (t, c, l, host) in self.HDB:
        groups[host].append((t, c, l))
    for host in groups:
        omega = set()
        for (t, c, l) in groups[host]:
            omega.add(c)
        if len(omega) == 1:
            # A host that only ever serves one app carries no
            # discriminative context; skip it.
            groups[host] = []
            print('skipped', omega, host)
    for host in groups:
        self.host = host
        self.HDB = groups[host]
        for (t, c, l) in self.HDB:
            for w in t:
                counter[w].add(c)
        self.IDF = utils.cal_idf(counter)
        self.mine_context()
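# Illustration of the host normalization used in train() above: a numeric
# label followed by a dot collapses to a generic '[0-9]+.' marker, so
# numbered CDN shards group under one key (the input host is made up).
def _demo_normalize_host():
    host = re.sub(r'[0-9]+\.', '[0-9]+.', 'img3.cdn42.example.com')
    assert host == 'img[0-9]+.cdn[0-9]+.example.com'
    return host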