def computeTFIDF(self, documentFrequencies, corpusSize=81):
    """Weight this document's word scores by inverse document frequency.

    The document's ``self.contents`` is passed through
    ``cleanup.filterStopwords`` and ``cleanup.stripPunctuation``, scored with
    ``self.frequency`` and each score is then multiplied by
    ``log(corpusSize / documentFrequency)``.  The result is stored in
    ``self.tfidfscores``; nothing is returned.

    Args:
        documentFrequencies: container supporting ``in`` and ``[]`` that maps
            a word to the number of documents containing it.
        corpusSize: total number of documents in the corpus.

    Words missing from ``documentFrequencies`` are weighted by
    ``log(corpusSize)``, i.e. as if they appeared in exactly one document.
    """
    contents = cleanup.filterStopwords(self.contents)
    contents = cleanup.stripPunctuation(contents)
    # presumably a dict of word -> term frequency; it is updated in place.
    scoredWords = self.frequency(contents)

    # Force true division: with two ints, Python 2's "/" truncates the ratio
    # (e.g. 81 / 50 == 1), which collapses the IDF of common words to 0.
    totalDocuments = float(corpusSize)

    for word in scoredWords:
        if word in documentFrequencies:
            idf = math.log(totalDocuments / documentFrequencies[word])
        else:
            idf = math.log(totalDocuments)
        scoredWords[word] = scoredWords[word] * idf

    self.tfidfscores = scoredWords
def buildCorpus(Dir):
    """Count, for every word, how many documents under ``Dir`` contain it.

    ``Dir`` is expected to hold one sub-directory per class, each containing
    the documents of that class.  Every document is split on whitespace and
    passed through ``cleanup.filterStopwords`` and
    ``cleanup.stripPunctuation``; each distinct remaining word adds one to
    that word's document count.

    Side effect: the resulting counts are pickled to a file named
    ``dictionary`` in the current working directory.

    Args:
        Dir: path of the corpus root directory.

    Returns:
        A ``(size, frequencies)`` tuple: the total number of files seen and
        the word -> document-count dictionary.
    """
    frequencies = {}
    size = 0
    for Class in os.listdir(Dir):
        classDir = os.path.join(Dir, Class)
        fileList = os.listdir(classDir)
        size += len(fileList)
        for fileName in fileList:
            path = os.path.join(classDir, fileName)
            # Context manager so the handle is closed even if read() fails.
            with open(path, 'r') as handle:
                data = handle.read()
            contents = cleanup.filterStopwords(data.split())
            contents = cleanup.stripPunctuation(contents)
            # set(): a word counts once per document, however often it occurs.
            for word in set(contents):
                frequencies[word] = frequencies.get(word, 0) + 1

    # pickle requires a binary-mode file; text mode fails on Python 3.
    with open('dictionary', 'wb') as out:
        pickle.dump(frequencies, out)
    return size, frequencies
def setupdocuments(self, Dir):
    """Load every document under ``Dir`` and register it with TF-IDF scores.

    The document-frequency table is unpickled from the file ``dictionary`` in
    the current working directory.  ``Dir`` is expected to hold one
    sub-directory per class; each file inside is read, cleaned with
    ``cleanup.filterStopwords`` / ``cleanup.stripPunctuation``, wrapped in a
    ``document``, scored via ``computeTFIDF`` and stored as a key of
    ``self.alldocuments``.

    Loading is best-effort: a file that cannot be processed is skipped with a
    logged warning instead of aborting the whole run.

    Args:
        Dir: path of the root directory containing the class sub-directories.
    """
    import logging

    log = logging.getLogger(__name__)

    # NOTE(review): size is accumulated but never used or returned in the
    # code visible here; kept in case it is needed elsewhere - confirm.
    size = 0

    # NOTE: pickle.load can execute arbitrary code; only safe when the
    # "dictionary" file is trusted (presumably produced by buildCorpus).
    # pickle requires a binary-mode file.
    with open("dictionary", "rb") as corpusfile:
        corpus = pickle.load(corpusfile)

    for Class in os.listdir(Dir):
        classDir = os.path.join(Dir, Class)
        fileList = os.listdir(classDir)
        size += len(fileList)
        for fileName in fileList:
            path = os.path.join(classDir, fileName)
            try:
                with open(path, "r") as handle:
                    data = handle.read()
                contents = cleanup.filterStopwords(data.split())
                contents = cleanup.stripPunctuation(contents)
                d = document(fileName, Class, contents)
                # No corpus size is passed, so the callee's default applies.
                d.computeTFIDF(corpus)
                self.alldocuments[d] = 1
            except Exception as exc:
                # Still skip the bad file, but say so; unlike a bare
                # "except:", KeyboardInterrupt/SystemExit now propagate.
                log.warning("Skipping %s: %s", path, exc)