def buildVSMClassifier(self, posFile, vsmClassifierFileName, th, topK):
    """Load a cached VSM classifier, or build one from the top-K vocabulary.

    First tries to unpickle a classifier from ``vsmClassifierFileName`` into
    ``self.classifier``. If that fails for any ordinary reason (missing or
    unreadable file, corrupt pickle, ...), the classifier is rebuilt from the
    documents listed in ``posFile`` and written back to the cache file.

    NOTE(review): another method named ``buildVSMClassifier`` (taking
    ``leastK`` instead of ``topK``) is defined later in this file; if both
    live in the same class, the later definition replaces this one.

    Args:
        posFile: Path to a text file with one document URL per line.
        vsmClassifierFileName: Path of the pickle cache to read and, on a
            rebuild, to write.
        th: Passed unchanged to ``VSMClassifier`` (presumably a similarity
            threshold; confirm against ``VSMClassifier``).
        topK: Number of leading entries of the sorted vocabulary to keep.

    Returns:
        None. The result is stored in ``self.classifier``.
    """
    # SECURITY: pickle.load executes arbitrary code embedded in the file;
    # only point this at cache files this program wrote itself.
    try:
        with open(vsmClassifierFileName, "rb") as classifierFile:
            self.classifier = pickle.load(classifierFile)
        return
    except Exception:
        # Best-effort cache: any ordinary failure means "rebuild below".
        # KeyboardInterrupt/SystemExit are deliberately not swallowed.
        pass

    # Keep only the documents that were created and carry text.
    docs = []
    with open(posFile, 'r') as f:
        for url in f:
            d = Document(url.strip())
            if d and d.text:
                docs.append(d)

    # Sum each word's frequency over all documents.
    vocabTFDic = {}
    for d in docs:
        wordsFreq = getFreq(d.getWords())
        for w in wordsFreq:
            vocabTFDic[w] = vocabTFDic.get(w, 0) + wordsFreq[w]

    # getSorted(..., 1) presumably orders the (word, freq) pairs by freq,
    # highest first -- confirm against getSorted.
    vocabSorted = getSorted(vocabTFDic.items(), 1)
    topVocabDic = dict(vocabSorted[:topK])

    # The per-document vectors are intentionally left empty here; only the
    # aggregated top vocabulary is handed to the classifier.
    ndocsTF = []
    self.classifier = VSMClassifier(topVocabDic, ndocsTF, th)

    with open(vsmClassifierFileName, "wb") as classifierFile:
        pickle.dump(self.classifier, classifierFile)
def buildVSMClassifier_OneTargetTopicVector(self, posFile, vsmClassifierFileName, th, topK):
    """Load a cached VSM classifier, or build one from a single topic vector.

    First tries to unpickle a classifier from ``vsmClassifierFileName`` into
    ``self.classifier``. If that fails for any ordinary reason, the
    classifier is rebuilt from the documents listed in ``posFile``: every
    word is scored with the sum of ``1 + log(tf)`` over the documents that
    contain it, the ``topK`` leading entries of the sorted scores form the
    topic vector, and the new classifier is pickled to the cache file.

    Args:
        posFile: Path to a text file with one document URL per line.
        vsmClassifierFileName: Path of the pickle cache to read and, on a
            rebuild, to write.
        th: Passed unchanged to ``VSMClassifier`` (presumably a similarity
            threshold; confirm against ``VSMClassifier``).
        topK: Number of leading entries of the sorted vocabulary to keep.

    Returns:
        None. The result is stored in ``self.classifier``.
    """
    # SECURITY: pickle.load executes arbitrary code embedded in the file;
    # only point this at cache files this program wrote itself.
    try:
        with open(vsmClassifierFileName, "rb") as classifierFile:
            self.classifier = pickle.load(classifierFile)
        return
    except Exception:
        # Best-effort cache: any ordinary failure means "rebuild below".
        # KeyboardInterrupt/SystemExit are deliberately not swallowed.
        pass

    # Keep only the documents that were created and carry text.
    docs = []
    with open(posFile, 'r') as f:
        for url in f:
            d = Document(url.strip())
            if d and d.text:
                docs.append(d)

    # word -> list of its per-document term frequencies.
    vocabTFDic = {}
    for d in docs:
        wordsFreq = getFreq(d.getWords())
        for w in wordsFreq:
            vocabTFDic.setdefault(w, []).append(wordsFreq[w])

    # The IDF factor is fixed at 1.0, so a word's score is just the sum of
    # its log-scaled term frequencies.
    idf = 1.0
    vocTF_IDF = [
        (w, sum([1 + math.log(vtf) for vtf in vocabTFDic[w]]) * idf)
        for w in vocabTFDic
    ]

    # getSorted(..., 1) presumably orders the (word, score) pairs by score,
    # highest first -- confirm against getSorted.
    vocabSorted = getSorted(vocTF_IDF, 1)
    # Single-argument parenthesised form prints the same on Python 2 and 3.
    print(vocabSorted[:topK])
    topVocabDic = dict(vocabSorted[:topK])

    self.classifier = VSMClassifier(topVocabDic, th)

    with open(vsmClassifierFileName, "wb") as classifierFile:
        pickle.dump(self.classifier, classifierFile)
def buildVSMClassifier(self, posFile, vsmClassifierFileName, th, leastK):
    """Load a cached VSM classifier, or build one from frequent vocabulary.

    First tries to unpickle a classifier from ``vsmClassifierFileName`` into
    ``self.classifier``. If that fails for any ordinary reason, the
    classifier is rebuilt from the documents listed in ``posFile``: each
    word's frequency is summed over all documents (its collection
    frequency), words whose total is at least ``leastK`` are kept, and the
    new classifier is pickled to the cache file.

    Args:
        posFile: Path to a text file with one document URL per line.
        vsmClassifierFileName: Path of the pickle cache to read and, on a
            rebuild, to write.
        th: Passed unchanged to ``VSMClassifier`` (presumably a similarity
            threshold; confirm against ``VSMClassifier``).
        leastK: Minimum collection frequency a word needs to be kept.

    Returns:
        None. The result is stored in ``self.classifier``.
    """
    # SECURITY: pickle.load executes arbitrary code embedded in the file;
    # only point this at cache files this program wrote itself.
    try:
        with open(vsmClassifierFileName, "rb") as classifierFile:
            self.classifier = pickle.load(classifierFile)
        return
    except Exception:
        # Best-effort cache: any ordinary failure means "rebuild below".
        # KeyboardInterrupt/SystemExit are deliberately not swallowed.
        pass

    # Keep only the documents that were created and carry text.
    docs = []
    with open(posFile, 'r') as f:
        for url in f:
            d = Document(url.strip())
            if d and d.text:
                docs.append(d)

    # Accumulate each word's collection frequency in a plain dict, so no
    # default factory is needed.
    voc_CollFreq = {}
    for d in docs:
        wordsFreq = getFreq(d.getWords())
        for w in wordsFreq:
            voc_CollFreq[w] = voc_CollFreq.get(w, 0) + wordsFreq[w]

    # Unpack (word, total) explicitly so the threshold is applied to the
    # word's frequency and the dict maps word -> frequency.
    vocab_filtered_dict = dict(
        (w, total) for w, total in voc_CollFreq.items() if total >= leastK
    )

    self.classifier = VSMClassifier(vocab_filtered_dict, th)

    with open(vsmClassifierFileName, "wb") as classifierFile:
        pickle.dump(self.classifier, classifierFile)