def trainNB(self, stopWordFlag):
    """Train the Naive Bayes model on the training data.

    Stores the stop-word setting, rebuilds the vocabulary, collects word
    frequencies and finally computes the conditional probabilities.

    Args:
        stopWordFlag: Stored as ``self.stopWordFlag`` before the
            vocabulary is built, so the helpers called here can read it.
    """
    self.stopWordFlag = stopWordFlag
    parser = dataParser()
    self.build_vocab()
    # build_wordFreq is seeded with a count of 0; whatever it returns is
    # passed straight to calc_cond_prob (presumably the number of training
    # documents -- confirm against build_wordFreq).
    doc_total = self.build_wordFreq(parser, 0)
    self.calc_cond_prob(doc_total)
def applyNB(self, file):
    """Classify a single document as ``"spam"`` or ``"ham"``.

    For each class the score is the log prior plus the sum of the log
    conditional probabilities of every parsed word that is in the
    vocabulary and has an entry for that class in ``self.condProb``.

    Args:
        file: Path handed to ``dataParser().parse_file``.

    Returns:
        ``"spam"`` when the spam score is strictly greater than the ham
        score, otherwise ``"ham"`` (ties therefore resolve to ``"ham"``).
    """
    words = dataParser().parse_file(file)
    score = {}
    for cl in self.classes:
        # Prior and likelihood terms must use the same logarithm base;
        # mixing natural log with log10 over-weights the prior relative to
        # the word evidence, so natural log is used throughout.
        total = math.log(self.prior[cl])
        for w in words:
            if w in self.vocab and cl in self.condProb[w]:
                total += math.log(self.condProb[w][cl])
        score[cl] = total
    return "spam" if score["spam"] > score["ham"] else "ham"
def countWords(self):
    """Build ``self.countMatrix``: word counts per class, per training file.

    The result maps ``class -> file name -> frequency dict``. Each
    frequency dict is filled by ``self.addToList`` and, when
    ``self.stopWordFlag`` is truthy, passed through ``self.rmvSWTest``
    before a deep copy of it is stored.
    """
    parser = dataParser()
    self.countMatrix = {}
    for label in self.classes:
        self.countMatrix[label] = {}
        for name in self.trainData[label].files:
            path = "/".join((self.trainPath, label, name))
            freq = {}
            self.addToList(parser.parse_file(path), freq)
            if self.stopWordFlag:
                self.rmvSWTest(freq)
            self.countMatrix[label][name] = copy.deepcopy(freq)
def build_vocab(self):
    """Rebuild ``self.vocab`` from every training file of every class.

    The vocabulary is the set of all words returned by the parser for the
    training files. When ``self.stopWordFlag`` is truthy, every entry of
    ``self.stopWords`` is removed from it afterwards.
    """
    parser = dataParser()
    self.vocab = set()
    for label in self.classes:
        for name in self.trainData[label].files:
            path = "/".join((self.trainPath, label, name))
            self.vocab.update(parser.parse_file(path))
    if self.stopWordFlag:
        # Stop words that never occurred in training are simply ignored.
        self.vocab.difference_update(self.stopWords)
def trainLogistic(self, stopWordFlag):
    """Train the logistic-regression model on the training data.

    Rebuilds the vocabulary and word statistics, initialises the bias
    (``"weight_zero"``) and one weight per vocabulary word to 0.01, builds
    the per-file count matrix, then calls ``self.updateWeights`` exactly
    ``self.iterations`` times.

    Args:
        stopWordFlag: Stored as ``self.stopWordFlag`` before the
            vocabulary is built, so the helpers called here can read it.
    """
    self.stopWordFlag = stopWordFlag
    self.build_vocab()
    # The return value of build_wordFreq is not needed here.
    self.build_wordFreq(parser=dataParser(), totalDocs=0)

    initial_weight = 0.01
    # Start from a clean slate: without this, weights for words that a
    # previous training run kept (e.g. stop words) but this run's
    # vocabulary excludes would remain in the dictionary.
    self.weightList.clear()
    self.weightList["weight_zero"] = initial_weight
    self.weightList.update(dict.fromkeys(self.vocab, initial_weight))

    self.countWords()
    # range(1, n) performed only n - 1 updates (none at all for n == 1);
    # run the configured number of iterations.
    for _ in range(self.iterations):
        self.updateWeights()
def testLogistic(self):
    """Evaluate the logistic model on the test data.

    Each test file is parsed and turned into a frequency dict via
    ``self.addToList``. The output of ``self.calc_prob`` is rounded to an
    integer and used as an index into ``self.classes``; the prediction
    counts as a hit when that class equals the file's own class.

    Returns:
        The share of hits among all test files, as a percentage (float).

    Raises:
        ZeroDivisionError: If the test set contains no files.
    """
    parser = dataParser()
    correct = 0
    doc_count = 0
    for label in self.classes:
        names = self.testData[label].files
        doc_count += len(names)
        for name in names:
            path = "/".join((self.testPath, label, name))
            freq = {}
            self.addToList(parser.parse_file(path), freq)
            predicted = int(round(self.calc_prob(freq)))
            if label == self.classes[predicted]:
                correct += 1
    return float(correct) * 100 / doc_count
def __init__(self, trainPath, testPath, stopWordFile):
    """Set up paths, stop words, per-class data and empty model state.

    Args:
        trainPath: Root of the training data; one ``Model`` is created per
            class from it.
        testPath: Root of the test data; one ``Model`` is created per
            class from it.
        stopWordFile: File parsed with ``dataParser().parse_file`` to
            obtain ``self.stopWords``.
    """
    self.trainPath = trainPath
    self.testPath = testPath
    self.stopWords = dataParser().parse_file(stopWordFile)
    self.classes = ['ham', 'spam']
    self.trainData = {
        label: Model(self.trainPath, label) for label in self.classes
    }
    self.testData = {
        label: Model(self.testPath, label) for label in self.classes
    }
    # Model state, filled in by training.
    self.vocab = set()
    self.prior = {}
    self.condProb = {}
def __init__(self, trainPath, testPath, stopWordFile, learnRate, regFactor, iterations):
    """Set up paths, hyper-parameters, stop words and per-class data.

    Args:
        trainPath: Root of the training data; one ``Model`` is created per
            class from it.
        testPath: Root of the test data; one ``Model`` is created per
            class from it.
        stopWordFile: File parsed with ``dataParser().parse_file`` to
            obtain ``self.stopWords``.
        learnRate: Stored as ``self.learnRate``.
        regFactor: Stored as ``self.regFactor``.
        iterations: Stored as ``self.iterations``.
    """
    self.trainPath = trainPath
    self.testPath = testPath
    self.learnRate = learnRate
    self.regFactor = regFactor
    self.iterations = iterations
    self.stopWords = dataParser().parse_file(stopWordFile)
    self.classes = ['ham', 'spam']
    self.trainData = {
        label: Model(self.trainPath, label) for label in self.classes
    }
    self.testData = {
        label: Model(self.testPath, label) for label in self.classes
    }
    # Model state, filled in by training.
    self.vocab = set()
    self.weightList = {}