Esempio n. 1
0
    def train(self, trainSet, ruleType, ifPersist=True, datasize=0):
        """Mine user-agent token rules from the training set.

        :param trainSet: training data, iterated via DataSetIter.iter_pkg.
        :param ruleType: rule category (unused here; kept for interface parity).
        :param ifPersist: when True, persist the mined rules after mining.
        :param datasize: unused; kept for interface compatibility.
        """
        counter = defaultdict(set)
        totalApps = set()
        for tbl, pkg in DataSetIter.iter_pkg(trainSet):
            if pkg.agent == 'None':
                continue
            # BUG FIX: the original used map() purely for side effects, which
            # is a silent no-op under Python 3's lazy map; it also tokenized
            # the agent twice.  Tokenize once and loop explicitly.
            tokens = [w for w in SPLITTER.split(utils.process_agent(pkg.agent)) if w]
            for w in tokens:
                counter[w].add(pkg.app)
            segAgent = tuple(['^'] + tokens + ['$'])
            self.HDB.append((segAgent, pkg.app, len(segAgent)))
            totalApps.add(pkg.app)

        self.omega = len(totalApps) * self.support_t
        self.totalApp = len(totalApps) * 1.0

        # Deduplicate transactions before mining.
        self.HDB = list(set(self.HDB))
        print("Data Size", len(self.HDB))

        for (t, c, l) in self.HDB:
            for w in t:
                counter[w].add(c)
        self.IDF = utils.cal_idf(counter)
        self.mine_context()
        # BUG FIX: ifPersist was previously ignored and persist() always ran.
        if ifPersist:
            persist(Rules)
Esempio n. 2
0
    def classify(self, testSet):
        """Classify packages in the test set using the trained agent rules.

        :param testSet: test data, iterated via DataSetIter.iter_pkg.
        :returns: {pkgId: {ruleType: prediction}} for every package.
        """
        # Group packages by (tokenized agent, raw host) so each distinct
        # agent string is classified only once.
        compressed = defaultdict(lambda: defaultdict(set))
        for tbl, pkg in DataSetIter.iter_pkg(testSet):
            tokens = [w for w in SPLITTER.split(utils.process_agent(pkg.agent)) if w]
            agent = tuple(['^'] + tokens + ['$'])
            compressed[agent][pkg.rawHost].add(pkg)

        batchPredicts, groundTruth = {}, {}
        for agent, host, pkgs in flatten(compressed):
            # BUG FIX: the original asserted a (condition, message) tuple,
            # which is always truthy, so the type check never fired.
            assert isinstance(pkgs, set), \
                "Type of pkgs is not correct" + str(type(pkgs))
            predict = {}
            for ruleType in self.rules:
                predict[ruleType] = self.rules[ruleType].search(agent)

            for pkg in pkgs:
                batchPredicts[pkg.id] = predict
                groundTruth[pkg.id] = pkg.app
                # Log misclassifications for debugging.
                if predict[consts.APP_RULE].label is not None and predict[
                        consts.APP_RULE].label != pkg.app:
                    print('>>>[AGENT CLASSIFIER ERROR] agent:', pkg.agent,
                          'App:', pkg.app, 'Prediction:',
                          predict[consts.APP_RULE])
        return batchPredicts  # , groundTruth
Esempio n. 3
0
def __compare(rst, testSet):
    """Compare predictions against the labelled test set.

    Input:
    :param rst: predictions from the test run, {pkgId: {ruleType: prediction}}.
    :param testSet: test data set, iterated via DataSetIter.iter_pkg.
    Output:
    - (FPR, TPR, PPV, correctApp, wrongApp, detectApp) where the app sets
      classify each app as fully-correct, partially-wrong, or detected.
    """
    P, groundT, T = defaultdict(set), defaultdict(set), defaultdict(int)

    for tbl, pkg in DataSetIter.iter_pkg(testSet):
        groundT[pkg.id] = pkg
        T[pkg.app] += 1
    for pkgId, predictions in rst.items():
        if predictions[consts.APP_RULE] is not None:
            label = predictions[consts.APP_RULE].label
            if label is not None:
                P[label].add(pkgId)

    total = len(groundT)
    correctApp, wrongApp, detectApp = set(), set(), set()
    accTP, accFP, accTN, accFN = 0, 0, 0, 0
    for app, tNum in T.items():
        pNum = len(P[app])
        # True positives: predicted-as-app packages whose ground truth agrees.
        TP = sum(1 for pkgId in P[app] if app == groundT[pkgId].app)

        if TP > 0 and TP == pNum:
            correctApp.add(app)
        elif pNum > 0 and TP != pNum:
            wrongApp.add(app)
        if pNum > 0:
            detectApp.add(app)

        FP = pNum - TP
        FN = tNum - TP
        TN = (total - tNum) - FP

        accTP += TP
        accFP += FP
        accTN += TN
        accFN += FN

    # BUG FIX: float() avoids Python-2 integer division (all metrics came
    # out 0); denominators are guarded against empty test sets.
    FPR = float(accFP) / (accFP + accTN) if (accFP + accTN) else 0.0
    TPR = float(accTP) / (accTP + accFN) if (accTP + accFN) else 0.0  # Recall
    PPV = float(accTP) / (accTP + accFP) if (accTP + accFP) else 0.0  # Precision
    print('Recall:', (accTP + accFN), '#', accTP)
    print('Precision:', (accTP + accFP), '#', accTP)
    return FPR, TPR, PPV, correctApp, wrongApp, detectApp
Esempio n. 4
0
def cal_roc(rst, testSet):
    """Build a ROC curve by sweeping the prediction-score threshold.

    :param rst: predictions, {pkgId: {ruleType: prediction}}.
    :param testSet: test data set, iterated via DataSetIter.iter_pkg.
    :returns: {scoreThreshold: (FPR, TPR, PPV)} for every observed score.
    """
    P, groundT, T = defaultdict(set), defaultdict(set), defaultdict(int)

    for tbl, pkg in DataSetIter.iter_pkg(testSet):
        groundT[pkg.id] = pkg
        T[pkg.app] += 1

    # Collect (pkgId, score) per predicted label and the set of distinct
    # scores, which become the sweep thresholds.
    scores = set()
    for pkgId, predictions in rst.items():
        if predictions[consts.APP_RULE] is not None:
            predict = predictions[consts.APP_RULE]
            if predict.label is not None:
                P[predict.label].add((pkgId, predict.score))
                scores.add(predict.score)

    total = len(groundT)
    roc = {}
    print("Start Construct ROC Curve ", len(scores))
    for scoreT in scores:
        correctApp, wrongApp, detectApp = set(), set(), set()
        accTP, accFP, accTN, accFN = 0, 0, 0, 0
        for app, tNum in T.items():
            # Keep only predictions at or above the current threshold.
            tmpP = [(pkgId, s) for (pkgId, s) in P[app] if s >= scoreT]
            pNum = len(tmpP)
            TP = sum(1 for pkgId, _ in tmpP if app == groundT[pkgId].app)

            if TP > 0 and TP == pNum:
                correctApp.add(app)
            elif pNum > 0 and TP != pNum:
                wrongApp.add(app)
            if pNum > 0:
                detectApp.add(app)

            FP = pNum - TP
            FN = tNum - TP
            TN = (total - tNum) - FP

            accTP += TP
            accFP += FP
            accTN += TN
            accFN += FN

        # BUG FIX: float() avoids Python-2 integer division; denominators
        # are guarded so an extreme threshold cannot raise ZeroDivisionError.
        FPR = float(accFP) / (accFP + accTN) if (accFP + accTN) else 0.0
        TPR = float(accTP) / (accTP + accFN) if (accTP + accFN) else 0.0  # Recall
        PPV = float(accTP) / (accTP + accFP) if (accTP + accFP) else 0.0  # Precision
        roc[scoreT] = (FPR, TPR, PPV)
    return roc
Esempio n. 5
0
        def __count(miner, check, ruleType):
            """Collect host features accepted by `check` and record host
            rules for every training package matching them.

            Closure over `self`, `trainSet`, and `hostRules` from the
            enclosing scope (not visible in this chunk).
            """
            hostNodes = self.root.children.values()
            tmpR = defaultdict(set)
            for node in filter(miner.filter, hostNodes):
                features = miner.features(self.fLib, node.appInfos)
                # Debug trace for one specific host.  BUG FIX: this was a
                # Python-2 print statement, inconsistent with the print()
                # function used everywhere else in this codebase.
                if 'ohmychef' in node.feature:
                    print('[uri128]', check(features, node.feature))
                if check(features, node.feature):
                    tmpR[ruleType].add(node.feature)

            for tbl, pkg in DataSetIter.iter_pkg(trainSet):
                if pkg.host in tmpR[ruleType]:
                    hostRules[ruleType][(pkg.rawHost, None,
                                         miner.label(pkg))].add(tbl)
Esempio n. 6
0
    def train(self, trainData, rule_type, ifPersist=True):
        """Index training packages into the rule tree, then derive host and
        path rules, optionally persisting both.

        :param trainData: training data, iterated via DataSetIter.iter_pkg.
        :param rule_type: rule category (not used directly here).
        :param ifPersist: when True, persist the derived rule sets.
        :returns: (hostRules, pathRules).
        """
        rawHost = defaultdict(set)
        for tbl, pkg in DataSetIter.iter_pkg(trainData):
            rawHost[pkg.host].add(pkg.rawHost)
            fs = get_f(pkg)
            self.__count(fs, pkg.app)
            # The first feature is skipped when inserting into the tree.
            self.add(self.root, fs[1:], pkg.appInfo, tbl)

        hostRules = self.__host_rules(trainData)
        pathRules = self.__path_rules(trainData)

        if ifPersist:
            for ruleSet in (hostRules, pathRules):
                self._persist(ruleSet)
        return hostRules, pathRules
Esempio n. 7
0
    def train(self, trainData, ruleType):
        """Mine frequent query contexts at several training-set sizes.

        For each sampled data size, group transactions by host, compute a
        support/effectiveness quality score per context, and store the
        top-K rules per package id in self.Rules.

        :param trainData: training data, iterated via DataSetIter.iter_pkg.
        :param ruleType: rule category (unused here; interface parity).
        """
        for datasize in [200000, 300000, 400000, 500000, 600000, 700000]:
            HDB = [(tbl, pkg) for tbl, pkg in DataSetIter.iter_pkg(trainData)]
            # Guard: random.sample raises ValueError when the requested size
            # exceeds the population, so cap it.
            sampleSize = min(datasize, len(HDB))
            HDB = [HDB[i] for i in sorted(random.sample(xrange(len(HDB)), sampleSize))]
            print('Datasize:', len(HDB))
            groups = defaultdict(list)
            for tbl, pkg in HDB:
                for host, key, value in get_f(pkg):
                    groups[host].append((pkg.trackId, tbl, key, value, pkg.app))

            print('Start mining frequent contexts')
            for host in groups:
                contexts = defaultdict(set)
                omega = set()
                seqs = defaultdict(set)
                sigAppMap = defaultdict(set)
                appSigMap = defaultdict(set)
                SC = defaultdict(set)
                date = defaultdict(set)
                for id, tbl, context, sig, app in groups[host]:
                    omega.add(app)
                    contexts[context].add(app)
                    sigAppMap[sig].add(app)
                    appSigMap[app].add(sig)
                    SC[sig].add(id)
                    date[(sig, app, context)].add(tbl)
                    date[(app, context)].add(tbl)
                    seqs[context].add(sig)

                for context, apps in contexts.items():
                    # Support: fraction of this host's apps sharing the context.
                    support = len(apps) * 1.0 / len(omega)
                    if support <= conf.query_labelT:
                        continue
                    # Effectiveness: averaged relevance of unambiguous
                    # signatures (those mapping to exactly one app).
                    effective = 0
                    for sig in seqs[context]:
                        if len(sigAppMap[sig]) == 1:
                            app = list(sigAppMap[sig])[0]
                            rel = 1.0 / len(appSigMap[app])
                            dateScore = 1.0 * len(date[(sig, app, context)]) / len(date[(app, context)])
                            effective += rel * dateScore
                    effective = effective / len(sigAppMap)
                    # Harmonic mean of effectiveness and support.
                    quality = 2 * effective * support / (effective + support)
                    if quality <= conf.query_scoreT:
                        continue
                    for sig in seqs[context]:
                        if len(sigAppMap[sig]) != 1:
                            continue
                        app = list(sigAppMap[sig])[0]
                        rel = 1.0 / len(appSigMap[app])
                        dateScore = 1.0 * len(date[(sig, app, context)]) / len(date[(app, context)])
                        sigQ = rel * dateScore
                        for id in SC[sig]:
                            # BUG FIX: the rule tuple stored the `consts`
                            # module instead of the mined context, and the
                            # re-sort indexed self.Rules[i] with the
                            # undefined name `i` (NameError) — use `id`.
                            self.Rules[id].append((quality, sigQ, context, host, sig))
                            # Keep the top-K rules, ranked by quality then
                            # per-signature score (the original key (x[1],
                            # x[2]) would have compared a module object).
                            self.Rules[id] = sorted(
                                self.Rules[id],
                                key=lambda x: (x[0], x[1]),
                                reverse=True)[:conf.query_K]
Esempio n. 8
0
    def __path_rules(self, trainSet):
        """Derive path-segment APP rules from segments carrying exactly one
        label, then map matching training packages onto those rules.

        :param trainSet: training data, iterated via DataSetIter.iter_pkg.
        :returns: {ruleType: {(rawHost, pathSeg, label): set(tbl)}}.
        """
        pathRules = defaultdict(lambda: defaultdict(set))
        candidates = defaultdict(set)
        # Only path segments with a single associated label are candidates.
        uniqueLabeled = filter(lambda x: len(x[1]) == 1,
                               self.pathLabel.iteritems())
        for pathSeg, labels in uniqueLabeled:
            label, = labels
            fs = self.fLib[consts.APP_RULE][label]
            if part(fs, pathSeg):
                candidates[consts.APP_RULE].add(pathSeg)

        for tbl, pkg in DataSetIter.iter_pkg(trainSet):
            pkgFs = set(get_f(pkg)[2:])
            for pathSeg in candidates[consts.APP_RULE]:
                if pathSeg not in pkgFs:
                    continue
                pathRules[consts.APP_RULE][(pkg.rawHost, pathSeg,
                                            pkg.label)].add(tbl)

        return pathRules
Esempio n. 9
0
    def train(self, trainSet, ruleType, datasize):
        """Mine per-host path-context rules from a sampled training set.

        :param trainSet: training data, iterated via DataSetIter.iter_pkg.
        :param ruleType: rule category (unused here; interface parity).
        :param datasize: number of HDB transactions to sample.
        """
        counter = defaultdict(set)
        totalApps = set()
        for tbl, pkg in DataSetIter.iter_pkg(trainSet):
            if pkg.path == 'None':
                continue
            # BUG FIX: the original used map() purely for side effects,
            # which is a silent no-op under Python 3's lazy map; it also
            # split the path twice.  Split once and loop explicitly.
            segs = [w for w in pkg.path.split('/') if w]
            for w in segs:
                counter[w].add(pkg.app)
            segPath = tuple(['^'] + segs + ['$'])
            # Normalize numeric host components ("c1." -> "[0-9]+.") so
            # numbered mirrors group together.  Raw string avoids the
            # invalid-escape-sequence warning.
            host = re.sub(r'[0-9]+\.', '[0-9]+.', pkg.rawHost)
            self.HDB.append((segPath, pkg.app, len(segPath), host))
            totalApps.add(pkg.app)

        self.omega = len(totalApps) * self.support_t
        self.totalApp = len(totalApps) * 1.0

        # Guard: random.sample raises ValueError when the requested size
        # exceeds the population, so cap it.
        sampleSize = min(datasize, len(self.HDB))
        self.HDB = [
            self.HDB[i]
            for i in sorted(random.sample(xrange(len(self.HDB)), sampleSize))
        ]
        print("Data Size", len(self.HDB))

        groups = defaultdict(list)
        for (t, c, l, host) in self.HDB:
            groups[host].append((t, c, l))

        # Drop hosts whose transactions all belong to one app; they carry
        # no discriminative context.
        for host in groups:
            omega = set(c for (t, c, l) in groups[host])
            if len(omega) == 1:
                groups[host] = []
                print('skipped', omega, host)

        # Mine each host's transactions independently; self.HDB/self.host
        # are clobbered per iteration for use by mine_context().
        for host in groups:
            self.host = host
            self.HDB = groups[host]

            for (t, c, l) in self.HDB:
                for w in t:
                    counter[w].add(c)
            self.IDF = utils.cal_idf(counter)
            self.mine_context()