import os


def startCrawl(seedsFile, evaluator, modelFile, ct, num=5, pagesLimit=100,
               pageScoreThreshold=0.5, urlScoreThreshold=0):
    """Run one focused crawl and write its pages, URLs, and evaluation to disk.

    ct selects the crawler type: 'b' (baseline), 'p' (probabilistic event),
    or 'e' (event-based).
    """
    mode = 1  # 1 = URL scoring, 0 = no URL scoring
    crawlParams = {"num_pages": pagesLimit,
                   "pageScoreThreshold": pageScoreThreshold,
                   "urlScoreThreshold": urlScoreThreshold,
                   "mode": mode}
    crawlParams['No_Keywords'] = num  # keywords used to represent the event/topic
    crawlParams['seedURLs'] = getSeedURLs(seedsFile)
    crawlParams['model'] = readFileLines(modelFile)
    crawlParams['restricted'] = 0
    crawlParams['combineScore'] = 0

    # Output goes under a directory named after the seeds file (minus extension).
    outputDir = seedsFile.split(".")[0]
    if ct == 'b':
        pagesDir = outputDir + "/base-webpages/"
        logDataFilename = pagesDir + "base-logData.txt"
        outputURLsFilename = pagesDir + "base-Output-URLs.txt"
        evalFilename = pagesDir + "base-evaluateData.txt"
        rp = baseFC(crawlParams)
    elif ct == 'p':
        pagesDir = outputDir + "/prob-webpages/"
        logDataFilename = pagesDir + "prob-logData.txt"
        outputURLsFilename = pagesDir + "prob-Output-URLs.txt"
        evalFilename = pagesDir + "prob-evaluateData.txt"
        rp = probEventFC(crawlParams)
    elif ct == 'e':
        pagesDir = outputDir + "/event-webpages/"
        logDataFilename = pagesDir + "event-logData.txt"
        outputURLsFilename = pagesDir + "event-Output-URLs.txt"
        evalFilename = pagesDir + "event-evaluateData.txt"
        rp = eventFC(crawlParams)
    else:
        raise ValueError("unknown crawler type: " + str(ct))

    if not os.path.exists(pagesDir):
        os.makedirs(pagesDir)

    # Log page id/score pairs, the crawled URLs, and each page's text.
    f = open(logDataFilename, "w")
    furl = open(outputURLsFilename, "w")
    for p in rp:
        f.write(str(p.pageId) + "," + str(p.pageUrl[2]) + "\n")
        furl.write(p.pageUrl[1].encode("utf-8") + "\n")
        ftext = open(pagesDir + str(p.pageId) + ".txt", "w")
        ftext.write(p.text.encode("utf-8"))
        ftext.close()
    f.close()
    furl.close()

    # Evaluate the crawled pages and persist the per-page relevance labels.
    res = evaluator.evaluateFC(rp)
    writeEvaluation(res, evalFilename)
    print sum(res)   # relevant pages
    print len(res)   # total pages crawled
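# startCrawl relies on project helpers (getSeedURLs, readFileLines,
# writeEvaluation) that are defined elsewhere. Below is a minimal sketch of
# what they might look like, an assumption rather than the project's actual
# implementation, given that seeds/model files hold one URL per line and
# evaluateFC returns a list of 0/1 relevance labels:
def readFileLines(fname):
    # Read a text file into a list of stripped, non-empty lines.
    with open(fname) as fin:
        return [line.strip() for line in fin if line.strip()]


def getSeedURLs(seedsFile):
    # Seeds files are assumed to hold one URL per line.
    return readFileLines(seedsFile)


def writeEvaluation(res, evalFilename):
    # Persist one relevance label per line, in crawl order.
    fout = open(evalFilename, "w")
    for r in res:
        fout.write(str(r) + "\n")
    fout.close()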
# vsmClassifierFileName = 'classifierVSM-' + posFile.split(".")[0].split('-')[1] + ".p"
# evaluator.buildVSMClassifier(posFile, vsmClassifierFileName, pageTh, noK)
inputFile = seedsFiles[i]
modelFile = modelFiles[i]  # 'modelFile.txt' # inputFile
mode = 1  # URL scoring with no page scoring
crawlParams = {"num_pages": pagesLimit,
               "pageScoreThreshold": pageTh,
               "urlScoreThreshold": urlsTh,
               "mode": mode}
crawlParams['No_Keywords'] = noK
crawlParams['seedURLs'] = getSeedURLs(inputFile)
crawlParams['model'] = readFileLines(modelFile)
crawlParams['restricted'] = 0
crawlParams['combineScore'] = 0
crawlParams['classifierFileName'] = vsmClassifierFileName
outputDir = inputFile.split(".")[0]
if ct == 'b':
    pagesDir = outputDir + "/base-webpages/"
    logDataFilename = pagesDir + "base-logData.txt"
    outputURLsFilename = pagesDir + "base-Output-URLs.txt"
    evalFilename = pagesDir + "base-evaluateData.txt"
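# The block above is the body of a per-topic driver loop: seedsFiles[i] pairs
# with modelFiles[i]. A minimal sketch of the enclosing loop, assuming the
# file lists and thresholds are set up front (all concrete values below are
# illustrative, not from the project):
seedsFiles = ["charleston-seeds.txt", "ferguson-seeds.txt"]  # hypothetical
modelFiles = ["charleston-model.txt", "ferguson-model.txt"]  # hypothetical
pagesLimit = 100  # pages to crawl per topic
pageTh = 0.5      # page score threshold
urlsTh = 0        # URL score threshold
noK = 5           # number of keywords representing the topic
ct = 'b'          # crawler type

for i in range(len(seedsFiles)):
    # ... per-topic setup and crawl as shown above ...
    pass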
def startCrawl(v, seedsFile, evaluator, modelFile, ct):
    """Earlier variant: run crawl version v with hard-coded parameters."""
    num = 15  # number of keywords to represent the event/topic
    pagesLimit = 500
    pageScoreThreshold = 0.7
    urlScoreThreshold = 0
    #mode = 0  # no URL scoring
    mode = 1   # URL scoring
    crawlParams = {"num_pages": pagesLimit,
                   "pageScoreThreshold": pageScoreThreshold,
                   "urlScoreThreshold": urlScoreThreshold,
                   "mode": mode}
    crawlParams['No_Keywords'] = num
    crawlParams['seedURLs'] = getSeedURLs(seedsFile)
    crawlParams['model'] = readFileLines(modelFile)
    crawlParams['restricted'] = 0

    # The two branches differ only in the crawler used and the output prefix.
    if ct == 'b':
        rp = baseFC(crawlParams)
        pagesDir = "base-webpages/" + str(v) + "/"
        prefix = "base"
    else:
        rp = eventFC(crawlParams)
        pagesDir = "event-webpages/" + str(v) + "/"
        prefix = "event"

    f = open(pagesDir + prefix + "-logData.txt", "w")
    furl = open(pagesDir + prefix + "-Output-URLs.txt", "w")
    for p in rp:
        f.write(str(p.pageId) + "," + str(p.pageUrl[2]) + "\n")
        furl.write(p.pageUrl[1].encode("utf-8") + "\n")
        ftext = open(pagesDir + str(p.pageId) + ".txt", "w")
        ftext.write(p.text.encode("utf-8"))
        ftext.close()
    f.close()
    furl.close()

    res = evaluator.evaluateFC(rp)
    writeEvaluation(res, pagesDir + prefix + "-evaluateData.txt")
    print sum(res)   # relevant pages
    print len(res)   # total pages crawled
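# Note that this earlier variant, unlike the one above, never creates its
# output directories, so base-webpages/<v>/ and event-webpages/<v>/ must
# exist before the run. A minimal usage sketch under that assumption (the
# seed and model filenames are hypothetical, and evaluator is assumed to be
# constructed elsewhere):
import os

for v in range(3):  # three crawl versions, illustrative
    for d in ("base-webpages/" + str(v), "event-webpages/" + str(v)):
        if not os.path.exists(d):
            os.makedirs(d)
    startCrawl(v, "charleston-seeds.txt", evaluator, "charleston-model.txt", 'e')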
# Build both page classifiers for the topic: a Naive Bayes classifier and a
# VSM (vector space model) classifier, each pickled for reuse.
classifierFileName = 'charlestonShooting_NBClassifier.p'
evaluator.buildClassifier(posFile, negFile, classifierFileName)
vsmClassifierFileName = 'classifierVSM-' + posFile.split(".")[0].split('-')[1] + ".p"
evaluator.buildVSMClassifier(posFile, vsmClassifierFileName, pageTh, noK)

inputFile = seedsFiles[i]
modelFile = modelFiles[i]  # 'modelFile.txt' # inputFile
mode = 1  # URL scoring with no page scoring
crawlParams = {"num_pages": pagesLimit,
               "pageScoreThreshold": pageTh,
               "urlScoreThreshold": urlsTh,
               "mode": mode}
crawlParams['No_Keywords'] = noK
crawlParams['seedURLs'] = getSeedURLs(inputFile)
crawlParams['model'] = readFileLines(modelFile)
crawlParams['restricted'] = 0
crawlParams['combineScore'] = 0
outputDir = inputFile.split(".")[0]
if ct == 'b':
    pagesDir = outputDir + "/base-webpages/"
    logDataFilename = pagesDir + "base-logData.txt"
    outputURLsFilename = pagesDir + "base-Output-URLs.txt"
    evalFilename = pagesDir + "base-evaluateData.txt"
    rp = baseFC(crawlParams)
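# The VSM classifier's pickle name is derived from the positive-examples
# filename, assumed to have the form <label>-<topic>.txt. A quick worked
# example of the string manipulation (the filename is hypothetical):
posFile = 'positive-charlestonShooting.txt'  # hypothetical
topic = posFile.split(".")[0].split('-')[1]  # -> 'charlestonShooting'
vsmClassifierFileName = 'classifierVSM-' + topic + ".p"
print vsmClassifierFileName                  # classifierVSM-charlestonShooting.p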