コード例 #1
0
 def buildVSMClassifier(self,posFile,vsmClassifierFileName,th,topK):
     """Load a cached VSM classifier, or build one from positive documents.

     If ``vsmClassifierFileName`` can be unpickled, the result is stored in
     ``self.classifier`` and nothing else happens.  Otherwise the classifier
     is built from the documents listed in ``posFile`` and then pickled to
     ``vsmClassifierFileName`` for the next call.

     Args:
         posFile: Path to a text file read line by line; each stripped line
             is passed to ``Document`` (presumably one URL per line --
             confirm against ``Document``).
         vsmClassifierFileName: Path of the pickle cache to read/write.
         th: Threshold forwarded unchanged to ``VSMClassifier``.
         topK: Number of leading entries of the ``getSorted`` result kept
             as the vocabulary.
     """
     try:
         # NOTE(security): pickle.load can execute arbitrary code; the cache
         # path must only ever point at files written by this program.
         with open(vsmClassifierFileName,"rb") as classifierFile:
             self.classifier = pickle.load(classifierFile)
     except Exception:
         # Best effort: a missing or unreadable cache is not an error, the
         # classifier is simply rebuilt below.  Catching Exception (rather
         # than using a bare except) lets KeyboardInterrupt/SystemExit through.
         pass
     else:
         return

     # Keep only documents that were created successfully and have text.
     docs = []
     with open(posFile,'r') as posUrls:
         for line in posUrls:
             d = Document(line.strip())
             if d and d.text:
                 docs.append(d)

     # Collection-wide term frequency: per-document counts summed per word.
     vocabTFDic = {}
     for d in docs:
         wordsFreq = getFreq(d.getWords())
         for w in wordsFreq:
             vocabTFDic[w] = vocabTFDic.get(w, 0) + wordsFreq[w]

     # getSorted presumably orders the (word, freq) pairs by freq (index 1);
     # the first topK pairs become the vocabulary.
     vocabSorted = getSorted(vocabTFDic.items(), 1)
     topVocabDic = dict(vocabSorted[:topK])

     # Per-document vectors are passed as an empty list: no code populates
     # them in this method.
     ndocsTF = []

     self.classifier = VSMClassifier(topVocabDic,ndocsTF,th)
     with open(vsmClassifierFileName,"wb") as classifierFile:
         pickle.dump(self.classifier,classifierFile)
コード例 #2
0
 def buildVSMClassifier_OneTargetTopicVector(self,posFile,vsmClassifierFileName,th,topK):
     """Load a cached VSM classifier, or build a single topic vector and cache it.

     If ``vsmClassifierFileName`` can be unpickled, the result is stored in
     ``self.classifier`` and nothing else happens.  Otherwise every word seen
     in the positive documents is scored as ``sum(1 + log(tf))`` over the
     documents containing it, the first ``topK`` entries of the sorted
     scores form the vocabulary, and the resulting ``VSMClassifier`` is
     pickled to ``vsmClassifierFileName``.

     Args:
         posFile: Path to a text file read line by line; each stripped line
             is passed to ``Document`` (presumably one URL per line --
             confirm against ``Document``).
         vsmClassifierFileName: Path of the pickle cache to read/write.
         th: Threshold forwarded unchanged to ``VSMClassifier``.
         topK: Number of leading entries of the ``getSorted`` result kept
             as the vocabulary.
     """
     try:
         # NOTE(security): pickle.load can execute arbitrary code; the cache
         # path must only ever point at files written by this program.
         with open(vsmClassifierFileName,"rb") as classifierFile:
             self.classifier = pickle.load(classifierFile)
     except Exception:
         # Best effort: a missing or unreadable cache is not an error, the
         # classifier is simply rebuilt below.  Catching Exception (rather
         # than using a bare except) lets KeyboardInterrupt/SystemExit through.
         pass
     else:
         return

     # Keep only documents that were created successfully and have text.
     docs = []
     with open(posFile,'r') as posUrls:
         for line in posUrls:
             d = Document(line.strip())
             if d and d.text:
                 docs.append(d)

     # For every word, collect its term frequency in each document where it occurs.
     vocabTFDic = {}
     for d in docs:
         wordsFreq = getFreq(d.getWords())
         for w in wordsFreq:
             vocabTFDic.setdefault(w, []).append(wordsFreq[w])

     # Log-scaled TF summed over documents; the IDF factor is fixed at 1.0,
     # i.e. IDF weighting is effectively switched off.
     idf = 1.0
     vocTF_IDF = [(w, sum(1 + math.log(vtf) for vtf in tfs) * idf)
                  for w, tfs in vocabTFDic.items()]

     # getSorted presumably orders the (word, score) pairs by score (index 1).
     vocabSorted = getSorted(vocTF_IDF, 1)
     # Single-argument call form prints identically on Python 2 and Python 3.
     print(vocabSorted[:topK])
     topVocabDic = dict(vocabSorted[:topK])

     self.classifier = VSMClassifier(topVocabDic,th)
     with open(vsmClassifierFileName,"wb") as classifierFile:
         pickle.dump(self.classifier,classifierFile)
コード例 #3
0
 def buildVSMClassifier(self,posFile,vsmClassifierFileName,th,leastK):
     """Load a cached VSM classifier, or build one from frequent words and cache it.

     If ``vsmClassifierFileName`` can be unpickled, the result is stored in
     ``self.classifier`` and nothing else happens.  Otherwise the collection
     frequency of every word in the positive documents is computed, words
     whose collection frequency is at least ``leastK`` form the vocabulary,
     and the resulting ``VSMClassifier`` is pickled to
     ``vsmClassifierFileName``.

     Args:
         posFile: Path to a text file read line by line; each stripped line
             is passed to ``Document`` (presumably one URL per line --
             confirm against ``Document``).
         vsmClassifierFileName: Path of the pickle cache to read/write.
         th: Threshold forwarded unchanged to ``VSMClassifier``.
         leastK: Minimum collection frequency (inclusive) a word needs to be
             kept in the vocabulary.
     """
     try:
         # NOTE(security): pickle.load can execute arbitrary code; the cache
         # path must only ever point at files written by this program.
         with open(vsmClassifierFileName,"rb") as classifierFile:
             self.classifier = pickle.load(classifierFile)
     except Exception:
         # Best effort: a missing or unreadable cache is not an error, the
         # classifier is simply rebuilt below.  Catching Exception (rather
         # than using a bare except) lets KeyboardInterrupt/SystemExit through.
         pass
     else:
         return

     # Keep only documents that were created successfully and have text.
     docs = []
     with open(posFile,'r') as posUrls:
         for line in posUrls:
             d = Document(line.strip())
             if d and d.text:
                 docs.append(d)

     # defaultdict needs a callable factory (``list``), not a list instance.
     vocabTFDic = defaultdict(list)
     for d in docs:
         wordsFreq = getFreq(d.getWords())
         for w in wordsFreq:
             vocabTFDic[w].append(wordsFreq[w])

     # Collection frequency = sum of the per-document frequencies of a word.
     voc_CollFreq = [(w, sum(tfs)) for w, tfs in vocabTFDic.items()]
     # Unpack each (word, freq) pair so the threshold is applied to the
     # frequency itself.
     vocab_filtered_dict = dict(
         (w, freq) for w, freq in voc_CollFreq if freq >= leastK
     )

     self.classifier = VSMClassifier(vocab_filtered_dict,th)
     with open(vsmClassifierFileName,"wb") as classifierFile:
         pickle.dump(self.classifier,classifierFile)