def evaluateVSM(targeEventFile, collFolder, k, relevTh, vsmClassifierFileName, topK):
    """Evaluate the first ``k`` pages of a collection with a VSM classifier.

    A fresh ``Evaluate`` instance builds its VSM classifier through
    ``buildVSMClassifier(targeEventFile, vsmClassifierFileName, relevTh,
    topK)``.  The files ``collFolder + "0.txt"`` ... ``collFolder +
    "<k-1>.txt"`` are then read as UTF-8, each text is wrapped in a
    ``myObj`` (stored on its ``text`` attribute), and the resulting list is
    handed to ``evaluateFC``.

    Args:
        targeEventFile: Forwarded unchanged to ``buildVSMClassifier``
            (presumably the file describing the target event -- confirm
            against ``Evaluate``).
        collFolder: String prefix of the page files.  It is concatenated
            directly with ``"<j>.txt"``, so it must already end with a path
            separator if it names a directory.
        k: Number of page files to read, numbered ``0`` to ``k - 1``.
        relevTh: Forwarded unchanged to ``buildVSMClassifier`` (presumably
            the relevance threshold -- confirm against ``Evaluate``).
        vsmClassifierFileName: Forwarded unchanged to ``buildVSMClassifier``.
        topK: Forwarded unchanged to ``buildVSMClassifier``.

    Returns:
        Whatever ``Evaluate.evaluateFC`` returns for the loaded pages.

    Raises:
        IOError/OSError: If one of the ``k`` page files cannot be opened.
        UnicodeDecodeError: If a page file is not valid UTF-8.
    """
    evaluator = Evaluate()
    evaluator.buildVSMClassifier(targeEventFile, vsmClassifierFileName, relevTh, topK)

    collFiles = []
    for j in range(k):
        fn = collFolder + str(j) + '.txt'
        # Read inside a context manager so every handle is closed as soon as
        # its text is loaded; otherwise up to k descriptors stay open until
        # garbage collection.
        with codecs.open(fn, encoding='utf-8') as f:
            ftext = f.read()
        page = myObj()
        page.text = ftext
        collFiles.append(page)

    return evaluator.evaluateFC(collFiles)
#for i in range(3): pagesLimit = 1000 noK = 10 pageTh = 0.2 urlsTh = 0 i = 0 ct = 'e' posFile = posFiles[i] negFile = negFiles[i] vsmClassifierFileName = 'classifierVSM-' + posFile.split(".")[0].split( '-')[0] + ".p" #evaluator.buildVSMClassifier(posFile, vsmClassifierFileName,pageTh,noK) evaluator.buildVSMClassifier(posFile, noK, vsmClassifierFileName) #classifierFileName = 'charlestonShooting_NBClassifier.p' #evaluator.buildClassifier(posFile, negFile, classifierFileName) #vsmClassifierFileName = 'classifierVSM-'+posFile.split(".")[0].split('-')[1]+".p" #evaluator.buildVSMClassifier(posFile, vsmClassifierFileName,pageTh,noK) inputFile = seedsFiles[i] modelFile = modelFiles[i] #'modelFile.txt'#inputFile mode = 1 # URL scoring with no page scoring crawlParams = { "num_pages": pagesLimit, "pageScoreThreshold": pageTh, "urlScoreThreshold": urlsTh, "mode": mode
negFiles = ['neg-Charlie.txt','neg-sydneyseige.txt'] ''' evaluator = Evaluate() #for i in range(3): noK = 10 th = 0.75 i=3 posFile = posFiles[i] negFile = negFiles[i] #modelFile = modelFile +"-"+str(i)+".txt" #classifierFileName = 'classifier'+posFile.split(".")[0].split('-')[1]+".p" vsmClassifierFileName = 'classifierVSM-'+posFile.split(".")[0].split('-')[1]+".p" #evaluator.buildClassifier(posFile,negFolder,classifierFileName) #evaluator.buildClassifier(posFile,negFile,classifierFileName) evaluator.buildVSMClassifier(posFile, vsmClassifierFileName,th,noK) v = 0 inputFile = seedsFiles[i].split('.')[0]+"_"+str(v)+".txt" ''' event = 'Charlie' posFile = 'pos_'+event+'.txt' classifierFileName = 'classifier_'+event+'.p' evaluator.buildClassifier(posFile,negFolder,classifierFileName) inputFile = 'seedURLs_'+event+'.txt' modelFile = 'modelURLs_'+ event + '.txt' ''' crawlType = 'e' modelFile = inputFile