def buildClassifierFolder(self, posFile, negFolder, classifierFileName):
    #negURLsFile = 'negFile.txt'
    try:
        # Reuse the cached classifier if it was pickled by a previous run.
        classifierFile = open(classifierFileName, "rb")
        self.classifier = pickle.load(classifierFile)
        classifierFile.close()
    except:
        # Otherwise (re)build it from the positive file and the folder of negative-URL files.
        posURLs = readFileLines(posFile)
        posLen = len(posURLs)
        negFiles = os.listdir(negFolder)
        negFiles = [os.path.join(negFolder, f) for f in negFiles if f.endswith(".txt")]
        #print negFiles
        negFilesURLs = [readFileLines(f) for f in negFiles]
        # Take roughly an equal share of negatives from each file so the negative
        # set stays balanced against the positive set.
        num = int(round(1.0 * posLen / len(negFiles)))
        negURLs = []
        for nfu in negFilesURLs:
            #print len(nfu)
            if num < len(nfu):
                #negURLs.extend(nfu[:num])
                negURLs.append(nfu[:num])
            else:
                #negURLs.extend(nfu)
                negURLs.append(nfu)
        #print len(negURLs)
        #self.classifier = train_SaveClassifierRandom(posURLs, negURLs, classifierFileName)
        self.classifier = train_SaveClassifier(posURLs, negURLs, classifierFileName)
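# Usage sketch (illustrative, not from the original code). The method is assumed
# to live on the evaluator object used elsewhere in this repo; the file and
# folder names below are placeholders. It loads the pickled classifier if one
# exists, otherwise it samples negatives from each *.txt file in negFolder and trains.
#
#   >>> evaluator.buildClassifierFolder('eventPos.txt',          # positive URLs, one per line
#   ...                                 'negative-collections/', # folder of *.txt negative-URL files
#   ...                                 'event_NBClassifier.p')  # pickle cache reused on later runs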
def buildClassifier(eventName):
    classifierFileName = eventName + '_NBClassifier.p'
    posURLsFileName = eventName + 'Pos.txt'
    negURLsFileName = eventName + 'Neg.txt'
    posURLs = eu.readFileLines(posURLsFileName)
    negURLs = eu.readFileLines(negURLsFileName)
    eu.train_SaveClassifier(posURLs, negURLs, classifierFileName)
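# Usage sketch (illustrative): buildClassifier(eventName) expects
# '<eventName>Pos.txt' and '<eventName>Neg.txt' in the working directory and
# saves the trained classifier as '<eventName>_NBClassifier.p'. The event name
# below is only an example.
#
#   >>> buildClassifier('charlestonShooting')
#   # reads charlestonShootingPos.txt / charlestonShootingNeg.txt,
#   # saves charlestonShooting_NBClassifier.p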
def buildClassifier(self, posFile, negFile, classifierFileName):
    #negURLsFile = 'negFile.txt'
    try:
        classifierFile = open(classifierFileName, "rb")
        self.classifier = pickle.load(classifierFile)
        classifierFile.close()
    except:
        posURLs = readFileLines(posFile)
        negURLs = readFileLines(negFile)
        self.classifier = train_SaveClassifier(posURLs, negURLs, classifierFileName)
def startCrawl(seedsFile, evaluator, modelFile, ct, num=5, pagesLimit=100,
               pageScoreThreshold=0.5, urlScoreThreshold=0):
    mode = 1  # URL scoring
    crawlParams = {"num_pages": pagesLimit,
                   "pageScoreThreshold": pageScoreThreshold,
                   "urlScoreThreshold": urlScoreThreshold,
                   "mode": mode}
    crawlParams['No_Keywords'] = num
    seedURLs = getSeedURLs(seedsFile)
    crawlParams['seedURLs'] = seedURLs
    modelURLs = readFileLines(modelFile)
    crawlParams['model'] = modelURLs
    crawlParams['restricted'] = 0
    crawlParams['combineScore'] = 0
    outputDir = seedsFile.split(".")[0]
    #crawlParams['t'] = t
    if ct == 'b':
        #baseRelevantPages = baseFC(crawlParams)
        pagesDir = outputDir + "/base-webpages/"
        logDataFilename = pagesDir + "base-logData.txt"
        outputURLsFilename = pagesDir + "base-Output-URLs.txt"
        evalFilename = pagesDir + "base-evaluateData.txt"
        rp = baseFC(crawlParams)
    elif ct == 'p':
        pagesDir = outputDir + "/prob-webpages/"
        logDataFilename = pagesDir + "prob-logData.txt"
        outputURLsFilename = pagesDir + "prob-Output-URLs.txt"
        evalFilename = pagesDir + "prob-evaluateData.txt"
        rp = probEventFC(crawlParams)
    elif ct == 'e':
        #eventRelevantPages = eventFC(crawlParams)
        pagesDir = outputDir + "/event-webpages/"
        logDataFilename = pagesDir + "event-logData.txt"
        outputURLsFilename = pagesDir + "event-Output-URLs.txt"
        evalFilename = pagesDir + "event-evaluateData.txt"
        rp = eventFC(crawlParams)
    #if not os.path.exists(outputDir):
    #    os.makedirs(outputDir)
    if not os.path.exists(pagesDir):
        os.makedirs(pagesDir)
    f = open(logDataFilename, "w")
    furl = open(outputURLsFilename, "w")
    for p in rp:
        f.write(str(p.pageId) + "," + str(p.pageUrl[2]) + "\n")
        #furl.write(p.pageUrl[1].encode("utf-8") + "," + str(p.estimatedScore) + "\n")
        furl.write(p.pageUrl[1].encode("utf-8") + "\n")
        ftext = open(pagesDir + str(p.pageId) + ".txt", "w")
        ftext.write(p.text.encode("utf-8"))
        ftext.close()
    f.close()
    furl.close()
    res = evaluator.evaluateFC(rp)
    writeEvaluation(res, evalFilename)
    print sum(res)
    print len(res)
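# Usage sketch (illustrative names): a typical call to this startCrawl. The seeds
# and model files are plain text files of URLs; ct selects the crawler
# ('b' -> baseFC, 'p' -> probEventFC, 'e' -> eventFC), and output goes under a
# directory named after the seeds file, e.g. 'charlestonSeeds/event-webpages/'
# for ct='e'.
#
#   >>> startCrawl('charlestonSeeds.txt', evaluator, 'charlestonModel.txt', ct='e',
#   ...            num=10, pagesLimit=500, pageScoreThreshold=0.7)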
def buildVSMClassifier(self, posFile, no_keywords, classifierFileName, error=0.05, roundPrec=3):
    try:
        classifierFile = open(classifierFileName, "rb")
        self.classifier = pickle.load(classifierFile)
        #self.classifier.error = 0.05
        #self.classifier.relevanceth = 0.75
        classifierFile.close()
    except:
        self.classifier = VSMClassifier()
        posURLs = readFileLines(posFile)
        self.classifier.buildVSMClassifier(posURLs, no_keywords, classifierFileName, error, roundPrec)
        print self.classifier.relevanceth
        classifierFile = open(classifierFileName, "wb")
        pickle.dump(self.classifier, classifierFile)
        classifierFile.close()
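# Usage sketch (illustrative): building the VSM classifier for an event. The
# positive-URLs file and pickle names are placeholders; no_keywords controls how
# many terms represent the event. The classifier is pickled to classifierFileName,
# so later runs load it instead of rebuilding.
#
#   >>> evaluator.buildVSMClassifier('eventPos.txt', 10, 'classifierVSM-event.p',
#   ...                              error=0.05, roundPrec=3)
#   # delete 'classifierVSM-event.p' to force a rebuild with new parameters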
#plt.colorbar()
plt.legend()
# show graph
plt.show()

def readGraphFile(graphFile):
    with open(graphFile) as f:
        lines = f.readlines()
    lines = [l.strip() for l in lines]
    #graph = [(int(l.split(",")[0]) + 1, int(l.split(',')[1]) + 1) for l in lines]
    graph = [(l.split(",")[0], l.split(',')[1]) for l in lines]
    return graph

# draw example
urlsFile = 'base-Output-URLs.txt'
urls = eu.readFileLines(urlsFile)
doms = [eu.getDomain(url) for url in urls]
uniqueDomsFreqDic = eu.getFreq(doms)
uDoms = uniqueDomsFreqDic.keys()
numDoms = len(uDoms)
uc = [random.random() for i in range(numDoms)]
uniqDomsColorsDic = dict(zip(uDoms, uc))
#c = [uniqDomsColorsDic[d] for d in doms]
#c = c[5:]
domsTuples = enumerate(doms)
domsDic = dict(domsTuples)
#domsDic = defaultdict(list)
#for i,d in domsTuples:
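# Input format expected by readGraphFile (a sketch; the meaning of the ids
# depends on how the crawler wrote the file): one comma-separated edge per line,
# source id first, e.g.
#
#   0,3
#   0,7
#   3,12
#
#   >>> graph = readGraphFile('base-crawl-graph.txt')   # hypothetical file name
#   >>> graph[0]
#   ('0', '3')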
        urlsDomsMapping[urlDom] = [url]
    return urlsDomsMapping

def filterURLsByKeyword(keyword, urlsDomsMapping):
    filteredurlsDomsMap = {}
    for k in urlsDomsMapping:
        if keyword in k.lower():
            continue
        else:
            filteredurlsDomsMap[k.lower()] = urlsDomsMapping[k]
    return filteredurlsDomsMap

def rankLongURLsBySources(filteredURLsSourcesMapping):
    mappingList = filteredURLsSourcesMapping.items()
    rankedSourcesList = sorted(mappingList, key=lambda x: len(x[1]), reverse=True)
    rankedURLsList = []
    for rs, urlList in rankedSourcesList:
        #rankedURLsList.extend(filteredURLsSourcesMapping[rs])
        rankedURLsList.extend(urlList)
    return rankedURLsList

if __name__ == '__main__':
    longURLFileName = sys.argv[1]
    #sourcesFileName = ''
    keyword = 'twitter'
    longURLs = eventUtils.readFileLines(longURLFileName)
    longURLsSourcesMapping = getLongURLsSourcesMapping(longURLs)
    filteredURLsSourcesMapping = filterURLsByKeyword(keyword.lower(), longURLsSourcesMapping)
    #sources = eventUtils.readFileLines(sourcesFileName)
    rankedURLs = rankLongURLsBySources(filteredURLsSourcesMapping)
    eventUtils.saveListToFile(rankedURLs, longURLFileName.split('.')[0] + '-Ranked.txt')
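# Usage sketch (illustrative; the script name is a placeholder): run with the
# long-URLs file as the only argument. URLs whose domain contains 'twitter' are
# dropped, the rest are grouped by domain, domains are ranked by how many URLs
# they contributed, and the ranked URL list is written to '<input>-Ranked.txt'.
#
#   $ python rankLongURLs.py charleston-longURLs.txt
#   # -> writes charleston-longURLs-Ranked.txt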
    #evaluator.buildVSMClassifier(posFile, vsmClassifierFileName, pageTh, noK)
    inputFile = seedsFiles[i]
    modelFile = modelFiles[i]  #'modelFile.txt' #inputFile
    mode = 1  # URL scoring with no page scoring
    crawlParams = {
        "num_pages": pagesLimit,
        "pageScoreThreshold": pageTh,
        "urlScoreThreshold": urlsTh,
        "mode": mode
    }
    crawlParams['No_Keywords'] = noK
    seedURLs = getSeedURLs(inputFile)
    crawlParams['seedURLs'] = seedURLs
    modelURLs = readFileLines(modelFile)
    crawlParams['model'] = modelURLs
    crawlParams['restricted'] = 0
    crawlParams['combineScore'] = 0
    crawlParams['classifierFileName'] = vsmClassifierFileName
    outputDir = inputFile.split(".")[0]
    #crawlParams['t'] = t
    if ct == 'b':
        #baseRelevantPages = baseFC(crawlParams)
        pagesDir = outputDir + "/base-webpages/"
        logDataFilename = pagesDir + "base-logData.txt"
        outputURLsFilename = pagesDir + "base-Output-URLs.txt"
        evalFilename = pagesDir + "base-evaluateData.txt"
        rp = baseFC(crawlParams)
        #rp = baseFC_OneTargetVector(crawlParams)
def startCrawl(v, seedsFile, evaluator, modelFile, ct):
    #switchFC = 1
    # number of keywords to represent event/topic
    num = 15
    pagesLimit = 500
    pageScoreThreshold = 0.7
    urlScoreThreshold = 0
    #mode = 0  # no URL scoring
    mode = 1  # URL scoring
    crawlParams = {"num_pages": pagesLimit,
                   "pageScoreThreshold": pageScoreThreshold,
                   "urlScoreThreshold": urlScoreThreshold,
                   "mode": mode}
    crawlParams['No_Keywords'] = num
    seedURLs = getSeedURLs(seedsFile)
    crawlParams['seedURLs'] = seedURLs
    modelURLs = readFileLines(modelFile)
    crawlParams['model'] = modelURLs
    crawlParams['restricted'] = 0
    #crawlParams['t'] = t
    if ct == 'b':
        #baseRelevantPages = baseFC(crawlParams)
        rp = baseFC(crawlParams)
        f = open("base-webpages/" + str(v) + "/" + "base-logData.txt", "w")
        furl = open("base-webpages/" + str(v) + "/" + "base-Output-URLs.txt", "w")
        for p in rp:
            f.write(str(p.pageId) + "," + str(p.pageUrl[2]) + "\n")
            #furl.write(p.pageUrl[1].encode("utf-8") + "," + str(p.estimatedScore) + "\n")
            furl.write(p.pageUrl[1].encode("utf-8") + "\n")
            ftext = open("base-webpages/" + str(v) + "/" + str(p.pageId) + ".txt", "w")
            ftext.write(p.text.encode("utf-8"))
            ftext.close()
        f.close()
        furl.close()
        res = evaluator.evaluateFC(rp)
        writeEvaluation(res, "base-webpages/" + str(v) + "/" + "base-evaluateData.txt")
        print sum(res)
        print len(res)
    else:
        #eventRelevantPages = eventFC(crawlParams)
        rp = eventFC(crawlParams)
        f = open("event-webpages/" + str(v) + "/" + "event-logData.txt", "w")
        furl = open("event-webpages/" + str(v) + "/" + "event-Output-URLs.txt", "w")
        for p in rp:
            f.write(str(p.pageId) + "," + str(p.pageUrl[2]) + "\n")
            #furl.write(p.pageUrl[1].encode('utf-8') + "," + str(p.estimatedScore) + "\n")
            furl.write(p.pageUrl[1].encode('utf-8') + "\n")
            ftext = open("event-webpages/" + str(v) + "/" + str(p.pageId) + ".txt", "w")
            ftext.write(p.text.encode("utf-8"))
            ftext.close()
        f.close()
        furl.close()
        res = evaluator.evaluateFC(rp)
        writeEvaluation(res, "event-webpages/" + str(v) + "/" + "event-evaluateData.txt")
        print sum(res)
        print len(res)
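# Note on this startCrawl variant (a sketch of assumed usage): unlike the
# variant that calls os.makedirs, it does not create its output directories,
# so 'base-webpages/<v>/' or 'event-webpages/<v>/' must exist before the call;
# v is a run identifier used only to separate output folders. 'modelFile.txt'
# below is a placeholder name taken from a comment elsewhere in this repo.
#
#   >>> os.makedirs('event-webpages/1')        # once per run id
#   >>> startCrawl(1, 'seeds.txt', evaluator, 'modelFile.txt', ct='e')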
    #evaluator.buildVSMClassifier(posFile, vsmClassifierFileName, pageTh, noK)
    classifierFileName = 'charlestonShooting_NBClassifier.p'
    evaluator.buildClassifier(posFile, negFile, classifierFileName)
    vsmClassifierFileName = 'classifierVSM-' + posFile.split(".")[0].split('-')[1] + ".p"
    evaluator.buildVSMClassifier(posFile, vsmClassifierFileName, pageTh, noK)
    inputFile = seedsFiles[i]
    modelFile = modelFiles[i]  #'modelFile.txt' #inputFile
    mode = 1  # URL scoring with no page scoring
    crawlParams = {"num_pages": pagesLimit,
                   "pageScoreThreshold": pageTh,
                   "urlScoreThreshold": urlsTh,
                   "mode": mode}
    crawlParams['No_Keywords'] = noK
    seedURLs = getSeedURLs(inputFile)
    crawlParams['seedURLs'] = seedURLs
    modelURLs = readFileLines(modelFile)
    crawlParams['model'] = modelURLs
    crawlParams['restricted'] = 0
    crawlParams['combineScore'] = 0
    outputDir = inputFile.split(".")[0]
    #crawlParams['t'] = t
    if ct == 'b':
        #baseRelevantPages = baseFC(crawlParams)
        pagesDir = outputDir + "/base-webpages/"
        logDataFilename = pagesDir + "base-logData.txt"
        outputURLsFilename = pagesDir + "base-Output-URLs.txt"
        evalFilename = pagesDir + "base-evaluateData.txt"
        rp = baseFC(crawlParams)
    elif ct == 'p':