コード例 #1
0
 def trainNB(self, stopWordFlag):
     """Run the Naive Bayes training steps in order.

     The steps are ``build_vocab``, ``build_wordFreq`` and
     ``calc_cond_prob``.

     Args:
         stopWordFlag: Stored as ``self.stopWordFlag`` before any of the
             training steps run.
     """
     self.stopWordFlag = stopWordFlag
     doc_parser = dataParser()
     self.build_vocab()
     # The document counter is seeded with zero; whatever build_wordFreq
     # returns is handed straight to calc_cond_prob.
     doc_total = self.build_wordFreq(doc_parser, 0)
     self.calc_cond_prob(doc_total)
コード例 #2
0
    def applyNB(self, file):
        """Classify one document as ``"spam"`` or ``"ham"``.

        For every class in ``self.classes`` the score starts at the log of
        the class prior and adds the log of the conditional probability of
        each parsed word that is in ``self.vocab`` and has an entry for that
        class in ``self.condProb``.

        Args:
            file: Passed unchanged to ``dataParser().parse_file``.

        Returns:
            ``"spam"`` if the spam score is strictly greater than the ham
            score, otherwise ``"ham"``.
        """
        wordList = dataParser().parse_file(file)
        score = {}
        for cl in self.classes:
            # Both the prior and the likelihood terms must use the same log
            # base. Mixing math.log with math.log10 scales the likelihood
            # sum by 1/ln(10) relative to the prior, which can change which
            # class ends up with the larger score.
            total = math.log(self.prior[cl])
            for w in wordList:
                if w in self.vocab and cl in self.condProb[w]:
                    total += math.log(self.condProb[w][cl])
            score[cl] = total

        return "spam" if score["spam"] > score["ham"] else "ham"
コード例 #3
0
 def countWords(self):
     """Rebuild ``self.countMatrix`` from the training files.

     The result maps each class in ``self.classes`` to a dict that maps
     each training file name to a copy of the frequency dict filled in by
     ``addToList`` for that file. When ``self.stopWordFlag`` is truthy the
     frequency dict is passed through ``rmvSWTest`` before being stored.
     """
     reader = dataParser()
     self.countMatrix = {}
     for label in self.classes:
         per_file = {}
         self.countMatrix[label] = per_file
         for fname in self.trainData[label].files:
             tokens = reader.parse_file(
                 "/".join((self.trainPath, label, fname)))
             freq = {}
             self.addToList(tokens, freq)
             if self.stopWordFlag:
                 self.rmvSWTest(freq)
             # Store an independent copy of the per-file counts.
             per_file[fname] = copy.deepcopy(freq)
コード例 #4
0
    def build_vocab(self):
        """Rebuild ``self.vocab`` from every training file of every class.

        Each file under ``<trainPath>/<class>/`` is parsed and its words are
        added to a fresh set. When ``self.stopWordFlag`` is truthy, every
        entry of ``self.stopWords`` is then removed from the set.
        """
        reader = dataParser()

        words = set()
        self.vocab = words
        for label in self.classes:
            for fname in self.trainData[label].files:
                path = "/".join((self.trainPath, label, fname))
                words.update(reader.parse_file(path))

        if self.stopWordFlag:
            # Stop words that never occurred are simply ignored.
            words.difference_update(self.stopWords)
コード例 #5
0
    def trainLogistic(self, stopWordFlag):
        """Initialise the weights and run ``self.iterations`` weight updates.

        Steps, in order: store the flag, rebuild the vocabulary, call
        ``build_wordFreq``, set every weight to the same starting value,
        call ``countWords``, then call ``updateWeights`` once per iteration.

        Args:
            stopWordFlag: Stored as ``self.stopWordFlag`` before the
                vocabulary is built.
        """
        self.stopWordFlag = stopWordFlag

        self.build_vocab()
        # The value returned by build_wordFreq is not needed here.
        self.build_wordFreq(parser=dataParser(), totalDocs=0)

        initial_weight = 0.01
        # "weight_zero" is presumably the bias term - confirm against
        # calc_prob / updateWeights.
        self.weightList["weight_zero"] = initial_weight
        for w in self.vocab:
            self.weightList[w] = initial_weight
        self.countWords()
        # range(n), not range(1, n): the latter would perform only n - 1
        # updates, and none at all when self.iterations == 1.
        for _ in range(self.iterations):
            self.updateWeights()
コード例 #6
0
 def testLogistic(self):
     """Return the percentage of test files whose prediction matches.

     For each test file the word frequencies are passed to ``calc_prob``;
     the result is rounded to an integer and used as an index into
     ``self.classes``. A hit is counted when that class equals the class
     the file is listed under.

     Returns:
         ``hits * 100 / total`` as a float.

     Raises:
         ZeroDivisionError: If there are no test files at all.
     """
     reader = dataParser()
     correct = 0
     seen = 0
     for label in self.classes:
         seen += len(self.testData[label].files)
         for fname in self.testData[label].files:
             tokens = reader.parse_file(
                 "/".join((self.testPath, label, fname)))
             freq = {}
             self.addToList(tokens, freq)
             # The rounded calc_prob output selects the predicted class.
             idx = int(round(self.calc_prob(freq)))
             if label == self.classes[idx]:
                 correct += 1
     return (float(correct) * 100) / seen
コード例 #7
0
    def __init__(self, trainPath, testPath, stopWordFile):
        """Set up paths, stop words, per-class data and empty model state.

        Args:
            trainPath: Stored as ``self.trainPath`` and passed to ``Model``
                for each class.
            testPath: Stored as ``self.testPath`` and passed to ``Model``
                for each class.
            stopWordFile: Parsed with ``dataParser().parse_file``; the
                result is stored as ``self.stopWords``.
        """
        self.trainPath = trainPath
        self.testPath = testPath

        stop_parser = dataParser()
        self.stopWords = stop_parser.parse_file(stopWordFile)

        self.classes = ['ham', 'spam']

        # One Model per class, first for the training path and then for
        # the test path.
        self.trainData = {
            label: Model(self.trainPath, label) for label in self.classes
        }
        self.testData = {
            label: Model(self.testPath, label) for label in self.classes
        }

        # Model state starts empty.
        self.vocab = set()
        self.prior = {}
        self.condProb = {}
コード例 #8
0
    def __init__(self, trainPath, testPath, stopWordFile, learnRate, regFactor,
                 iterations):
        """Set up paths, hyper-parameters, stop words and per-class data.

        Args:
            trainPath: Stored as ``self.trainPath`` and passed to ``Model``
                for each class.
            testPath: Stored as ``self.testPath`` and passed to ``Model``
                for each class.
            stopWordFile: Parsed with ``dataParser().parse_file``; the
                result is stored as ``self.stopWords``.
            learnRate: Stored as ``self.learnRate``.
            regFactor: Stored as ``self.regFactor``.
            iterations: Stored as ``self.iterations``.
        """
        self.trainPath = trainPath
        self.testPath = testPath
        self.learnRate = learnRate
        self.regFactor = regFactor
        self.iterations = iterations

        stop_parser = dataParser()
        self.stopWords = stop_parser.parse_file(stopWordFile)

        self.classes = ['ham', 'spam']

        # One Model per class, first for the training path and then for
        # the test path.
        self.trainData = {
            label: Model(self.trainPath, label) for label in self.classes
        }
        self.testData = {
            label: Model(self.testPath, label) for label in self.classes
        }

        # Vocabulary and weights start empty.
        self.vocab = set()
        self.weightList = {}