Code example #1
File: clustering.py  Project: sathify/clustering
import math  # module-level import required by the IDF weighting below

def computeTFIDF(self, documentFrequencies, corpusSize=81):
    """Score this document's words by TF-IDF against the given document frequencies."""
    contents = cleanup.filterStopwords(self.contents)
    contents = cleanup.stripPunctuation(contents)
    scoredWords = self.frequency(contents)  # raw term frequencies
    for word in scoredWords:
        if word in documentFrequencies:
            # standard weighting: tf * log(N / df)
            scoredWords[word] *= math.log(corpusSize / documentFrequencies[word])
        else:
            # word never seen in the corpus: treat its document frequency as 1
            scoredWords[word] *= math.log(corpusSize)
    self.tfidfscores = scoredWords
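
The weighting applied here is the usual tf-idf score, tf(w) * log(N / df(w)), with a corpus-unseen word treated as if its document frequency were 1. A quick numeric check with made-up values:

import math

corpusSize = 81   # N, the default used above
tf, df = 3, 9     # toy values: 3 occurrences here, present in 9 of 81 documents
print(tf * math.log(corpusSize / df))   # 3 * log(9), roughly 6.59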
Code example #2
File: clusters.py  Project: sathify/clustering
import os
import pickle

def buildCorpus(Dir):
    """Scan Dir's class subfolders and count, per word, how many documents contain it."""
    documentFrequencies = {}
    size = 0
    for Class in os.listdir(Dir):
        classDir = os.path.join(Dir, Class)
        fileList = os.listdir(classDir)
        size += len(fileList)
        for fileName in fileList:
            path = os.path.join(classDir, fileName)
            with open(path, 'r') as f:  # close the file instead of leaking the handle
                data = f.read()
            contents = cleanup.filterStopwords(data.split())
            contents = cleanup.stripPunctuation(contents)
            # set() so that each document counts a word at most once
            for word in set(contents):
                documentFrequencies[word] = documentFrequencies.get(word, 0) + 1
    # pickle requires a binary-mode handle, not 'w'
    with open('dictionary', 'wb') as out:
        pickle.dump(documentFrequencies, out)
    return size, documentFrequencies
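
As a usage sketch, assuming a tree laid out as corpus/&lt;class&gt;/&lt;file&gt; (the layout implied by the nested os.listdir calls; the directory name itself is hypothetical):

size, documentFrequencies = buildCorpus('corpus')
print(size, 'documents;', len(documentFrequencies), 'distinct words')
# side effect: the table is also pickled to ./dictionary for later reloading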
Code example #3
File: clustering.py  Project: sathify/clustering
def setupdocuments(self, Dir):
    """Rebuild TF-IDF-scored documents for every file under Dir."""
    size = 0
    # the frequency table was pickled in binary mode, so read it back the same way
    with open("dictionary", "rb") as corpusfile:
        corpus = pickle.load(corpusfile)
    for Class in os.listdir(Dir):
        classDir = os.path.join(Dir, Class)
        fileList = os.listdir(classDir)
        size += len(fileList)
        for fileName in fileList:
            path = os.path.join(classDir, fileName)
            try:
                with open(path, "r") as f:
                    data = f.read()
            except OSError:
                continue  # skip unreadable files; don't silently swallow every error
            contents = cleanup.filterStopwords(data.split())
            contents = cleanup.stripPunctuation(contents)
            d = document(fileName, Class, contents)
            d.computeTFIDF(corpus)
            self.alldocuments[d] = 1  # dict used as a set of scored documents
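
Taken together, the three snippets form a two-pass pipeline: buildCorpus() counts per-word document frequencies and pickles them to ./dictionary, then setupdocuments() reloads that table and TF-IDF-scores every file via document.computeTFIDF(). A minimal self-contained rendering of the same idea, with stand-ins for the project's cleanup helpers (the stopword list and punctuation stripping below are assumptions; the real cleanup module is not shown on this page):

import math
import string

STOPWORDS = {'the', 'a', 'an', 'and', 'of', 'to', 'in', 'is'}  # toy list, not the project's

def filterStopwords(words):
    # stand-in for cleanup.filterStopwords
    return [w for w in words if w.lower() not in STOPWORDS]

def stripPunctuation(words):
    # stand-in for cleanup.stripPunctuation
    stripped = (w.strip(string.punctuation) for w in words)
    return [w for w in stripped if w]

def tfidfScores(text, documentFrequencies, corpusSize):
    # same shape as document.computeTFIDF above, written as a free function
    words = stripPunctuation(filterStopwords(text.split()))
    scores = {}
    for w in words:
        scores[w] = scores.get(w, 0) + 1        # term frequency
    for w in scores:
        df = documentFrequencies.get(w, 1)      # unseen word -> df of 1
        scores[w] *= math.log(corpusSize / df)  # tf * idf
    return scores

# pass 1: document frequencies over a toy in-memory corpus
docs = ['the cat sat on the mat.', 'the dog chased the cat!', 'fish swim.']
dfTable = {}
for doc in docs:
    for w in set(stripPunctuation(filterStopwords(doc.split()))):
        dfTable[w] = dfTable.get(w, 0) + 1

# pass 2: score one document against the table
print(tfidfScores(docs[0], dfTable, len(docs)))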