Code Example #1
File: evaluate.py  Project: nyimbi/ProjFocusedCrawler
 def buildClassifierFolder(self,posFile,negFolder,classifierFileName):
     #negURLsFile = 'negFile.txt'
     try:
         classifierFile = open(classifierFileName,"rb")
         self.classifier = pickle.load(classifierFile)
         classifierFile.close()
         
     except:
         posURLs = readFileLines(posFile)
         posLen = len(posURLs)
         negFiles = os.listdir(negFolder)
         negFiles = [os.path.join(negFolder,f) for f in negFiles if f.endswith(".txt")]
         #print negFiles
         negFilesURLs = [readFileLines(f) for f in negFiles]
         
         num = int(round(1.0* posLen/len(negFiles)))
         negURLs = []
         for nfu in negFilesURLs:
             #print len(nfu)
             if num < len(nfu):
                 #negURLs.extend(nfu[:num] )
                 negURLs.append(nfu[:num] )
             else:
                 #negURLs.extend(nfu )
                 negURLs.append(nfu )
         #print len(negURLs)
         #self.classifier = train_SaveClassifierRandom(posURLs, negURLs, classifierFileName)
         self.classifier = train_SaveClassifier(posURLs, negURLs, classifierFileName)
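In buildClassifierFolder the negative URLs are spread across the .txt files in negFolder: each file contributes at most roughly posLen / len(negFiles) entries, so the negative set stays comparable in size to the positive set. A minimal standalone sketch of that sampling step (the name sample_negatives is hypothetical, and plain file reads stand in for the project's readFileLines helper):

import os

def sample_negatives(pos_urls, neg_folder):
    # Hypothetical helper: spread roughly len(pos_urls) negative URLs
    # evenly across the .txt files in neg_folder.
    neg_files = [os.path.join(neg_folder, f)
                 for f in os.listdir(neg_folder) if f.endswith(".txt")]
    per_file = int(round(float(len(pos_urls)) / len(neg_files)))
    negatives = []
    for path in neg_files:
        with open(path) as fh:
            urls = [line.strip() for line in fh if line.strip()]
        # Cap each file's contribution; the excerpt above appends
        # per-file lists, a flat list is used here for simplicity.
        negatives.extend(urls[:per_file] if per_file < len(urls) else urls)
    return negatives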
Code Example #2
def buildClassifier(eventName):
    classifierFileName = eventName+'_NBClassifier.p'    
    posURLsFileName = eventName+'Pos.txt'
    negURlsFileName = eventName+'Neg.txt'
    
    posURLs = eu.readFileLines(posURLsFileName)
    negURLs = eu.readFileLines(negURlsFileName)
    
    eu.train_SaveClassifier(posURLs, negURLs, classifierFileName)
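This variant derives every file name from the event name, so the training data is expected to follow the <event>Pos.txt / <event>Neg.txt convention and the trained model is pickled to <event>_NBClassifier.p. A small sketch of that convention (event_file_names is a hypothetical helper; the event name is borrowed from a later example in this listing):

def event_file_names(event_name):
    # Hypothetical helper mirroring the naming convention in buildClassifier.
    return (event_name + 'Pos.txt',
            event_name + 'Neg.txt',
            event_name + '_NBClassifier.p')

print(event_file_names('charlestonShooting'))
# ('charlestonShootingPos.txt', 'charlestonShootingNeg.txt', 'charlestonShooting_NBClassifier.p')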
Code Example #3
File: evaluate.py  Project: nyimbi/ProjFocusedCrawler
 def buildClassifier(self,posFile,negFile,classifierFileName):
     #negURLsFile = 'negFile.txt'
     try:
         classifierFile = open(classifierFileName,"rb")
         self.classifier = pickle.load(classifierFile)
         classifierFile.close()
         
     except:
         posURLs = readFileLines(posFile)
         negURLs = readFileLines(negFile)
         self.classifier = train_SaveClassifier(posURLs, negURLs, classifierFileName)
Code Example #4
def startCrawl(seedsFile,evaluator,modelFile,ct,num=5,pagesLimit=100, pageScoreThreshold=0.5,urlScoreThreshold=0):

    mode = 1 # URL scoring
    crawlParams = {"num_pages": pagesLimit,"pageScoreThreshold":pageScoreThreshold,"urlScoreThreshold":urlScoreThreshold ,"mode":mode}
    crawlParams['No_Keywords']=num
    seedURLs = getSeedURLs(seedsFile)
    crawlParams['seedURLs'] = seedURLs
    modelURLs = readFileLines(modelFile)
    crawlParams['model']=modelURLs
    crawlParams['restricted'] = 0
    crawlParams['combineScore'] = 0
    outputDir = seedsFile.split(".")[0]
    #crawlParams['t'] = t
    if ct =='b':
        #baseRelevantPages =baseFC(crawlParams)
        pagesDir=outputDir+"/base-webpages/"
        logDataFilename=pagesDir+"base-logData.txt"
        outputURLsFilename=pagesDir+"base-Output-URLs.txt"
        evalFilename=pagesDir+"base-evaluateData.txt"
        
        rp = baseFC(crawlParams)
        
    elif ct =='p':
        pagesDir=outputDir+"/prob-webpages/"
        logDataFilename=pagesDir+"prob-logData.txt"
        outputURLsFilename=pagesDir+"prob-Output-URLs.txt"
        evalFilename=pagesDir+"prob-evaluateData.txt"
        rp = probEventFC(crawlParams)
        
    elif ct =='e': 
        #eventRelevantPages = eventFC(crawlParams)
        pagesDir=outputDir+"/event-webpages/"
        logDataFilename=pagesDir+"event-logData.txt"
        outputURLsFilename=pagesDir+"event-Output-URLs.txt"
        evalFilename=pagesDir+"event-evaluateData.txt"
        rp = eventFC(crawlParams)
        
    
    #if not os.path.exists(outputDir):
    #    os.makedirs(outputDir)
    if not os.path.exists(pagesDir):
        os.makedirs(pagesDir)
    f = open(logDataFilename,"w")
    furl = open(outputURLsFilename,"w")
    
    for p in rp:
        f.write(str(p.pageId) + "," + str(p.pageUrl[2]) + "\n")
        #furl.write(p.pageUrl[1].encode("utf-8")+","+str(p.estimatedScore)+"\n")
        furl.write(p.pageUrl[1].encode("utf-8")+"\n")
        ftext = open(pagesDir+str(p.pageId) + ".txt", "w")
        ftext.write(p.text.encode("utf-8"))
        ftext.close()
    f.close()
    furl.close()
    
    res = evaluator.evaluateFC(rp)
    writeEvaluation(res,evalFilename)    
    print sum(res)
    print len(res)
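The two prints at the end summarise the crawl: assuming evaluator.evaluateFC returns one 0/1 relevance flag per fetched page (an assumption, since evaluateFC is not shown here), sum(res) over len(res) is the crawl's harvest rate. A minimal sketch of that ratio:

def harvest_rate(relevance_flags):
    # relevance_flags is assumed to be the list returned by
    # evaluator.evaluateFC(rp): one 0/1 value per crawled page.
    if not relevance_flags:
        return 0.0
    return float(sum(relevance_flags)) / len(relevance_flags)

print(harvest_rate([1, 0, 1, 1]))  # 0.75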
Code Example #5
File: evaluate.py  Project: nyimbi/ProjFocusedCrawler
 def buildVSMClassifier(self,posFile,no_keywords,classifierFileName,error=0.05,roundPrec=3):
     try:
         classifierFile = open(classifierFileName,"rb")
         self.classifier = pickle.load(classifierFile)
         #self.classifier.error = 0.05
         #self.classifier.relevanceth = 0.75
         classifierFile.close()
     except:
         self.classifier = VSMClassifier()
         posURLs = readFileLines(posFile)
         self.classifier.buildVSMClassifier(posURLs, no_keywords, classifierFileName,error,roundPrec)
         print self.classifier.relevanceth
         classifierFile = open(classifierFileName,"wb")
         pickle.dump(self.classifier,classifierFile)
         classifierFile.close()
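All of the build* methods above follow the same load-or-train pattern: try to unpickle a previously saved classifier and only rebuild it from the URL files when that fails. A generic sketch of the pattern (load_or_train, the train callable, and the cache path are illustrative names, not project code):

import os
import pickle

def load_or_train(cache_path, train):
    # Generic sketch of the load-or-train caching pattern; not project code.
    if os.path.exists(cache_path):
        try:
            with open(cache_path, "rb") as fh:
                return pickle.load(fh)
        except Exception:
            pass  # fall through and retrain on any load error
    obj = train()
    with open(cache_path, "wb") as fh:
        pickle.dump(obj, fh)
    return obj

Catching a narrow exception such as IOError or pickle.UnpicklingError would be safer than the bare except: used in the excerpts, which silently swallows unrelated errors as well.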
Code Example #6
    #plt.colorbar()
    plt.legend()
    # show graph
    plt.show()

def readGraphFile(graphFile):
    with open(graphFile) as f:
        lines = f.readlines()
    lines = [l.strip() for l in lines]
    #graph = [(int(l.split(",")[0])+1,int(l.split(',')[1])+1) for l in lines ]
    graph = [(l.split(",")[0],l.split(',')[1]) for l in lines ]
    return graph

# draw example
urlsFile = 'base-Output-URLs.txt'
urls = eu.readFileLines(urlsFile)

doms = [eu.getDomain(url) for url in urls]

uniqueDomsFreqDic = eu.getFreq(doms)
uDoms = uniqueDomsFreqDic.keys()
numDoms = len(uDoms)
uc=[random.random() for i in range(numDoms)]
uniqDomsColorsDic = dict(zip(uDoms,uc))
#c = [uniqDomsColorsDic[d] for d in doms]
#c = c[5:]

domsTuples = enumerate(doms)
domsDic = dict(domsTuples)
#domsDic = defaultdict(list)
#for i,d in domsTuples:
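The drawing example maps every crawled URL to its domain, counts how often each domain appears, and assigns each unique domain a random value to use as its colour in the plot. A self-contained sketch of that bookkeeping using only the standard library (urlparse and Counter stand in for eu.getDomain and eu.getFreq, whose exact behaviour is an assumption):

import random
from collections import Counter
try:
    from urllib.parse import urlparse   # Python 3
except ImportError:
    from urlparse import urlparse       # Python 2

urls = ["http://example.com/a", "http://example.com/b", "http://news.test/x"]

doms = [urlparse(u).netloc for u in urls]            # stand-in for eu.getDomain
dom_freq = Counter(doms)                             # stand-in for eu.getFreq
dom_color = {d: random.random() for d in dom_freq}   # one colour value per unique domain
print(dom_freq)
print(dom_color)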
Code Example #7
			urlsDomsMapping[urlDom]= [url]
	return urlsDomsMapping
	
def filterURLsByKeyword(keyword,urlsDomsMapping):
	filteredurlsDomsMap = {}
	for k in urlsDomsMapping:
		if keyword in k.lower():
			continue
		else:
			filteredurlsDomsMap[k.lower()] = urlsDomsMapping[k]
	return filteredurlsDomsMap
	
def rankLongURLsBySources(filteredURLsSourcesMapping):
	mappingList = filteredURLsSourcesMapping.items()
	rankedSourcesList = sorted(mappingList, key=lambda x: len(x[1]), reverse=True)
	rankedURLsList = []
	for rs,urlList in rankedSourcesList:
		#rankedURLsList.extend(filteredURLsSourcesMapping[rs])
		rankedURLsList.extend(urlList)
	return rankedURLsList

if __name__ == '__main__':
	longURLFileName = sys.argv[1]
	#sourcesFileName = ''
	keyword = 'twitter'
	longURLs = eventUtils.readFileLines(longURLFileName)
	longURLsSourcesMapping = getLongURLsSourcesMapping(longURLs)
	filteredURLsSourcesMapping = filterURLsByKeyword(keyword.lower(), longURLsSourcesMapping)
	#sources = eventUtils.readFileLines(sourcesFileName)
	rankedURLs = rankLongURLsBySources(filteredURLsSourcesMapping)
	eventUtils.saveListToFile(rankedURLs,longURLFileName.split('.')[0]+'-Ranked.txt')
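rankLongURLsBySources orders the expanded URLs by how many entries map to each source: mappings with the longest URL lists come first and their URLs are emitted in that order. A small self-contained illustration of the same sort (the sample data is invented):

def rank_by_source_count(url_sources_mapping):
    # Same sort key as rankLongURLsBySources: sources with the most URLs first.
    ranked = sorted(url_sources_mapping.items(),
                    key=lambda kv: len(kv[1]), reverse=True)
    out = []
    for _source, url_list in ranked:
        out.extend(url_list)
    return out

mapping = {                      # invented sample data
    "t.co": ["http://a.example/1", "http://a.example/2", "http://a.example/3"],
    "bit.ly": ["http://b.example/1"],
}
print(rank_by_source_count(mapping))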
Code Example #8
    #evaluator.buildVSMClassifier(posFile, vsmClassifierFileName,pageTh,noK)

    inputFile = seedsFiles[i]
    modelFile = modelFiles[i]  #'modelFile.txt'#inputFile

    mode = 1  # URL scoring with no page scoring
    crawlParams = {
        "num_pages": pagesLimit,
        "pageScoreThreshold": pageTh,
        "urlScoreThreshold": urlsTh,
        "mode": mode
    }
    crawlParams['No_Keywords'] = noK
    seedURLs = getSeedURLs(inputFile)
    crawlParams['seedURLs'] = seedURLs
    modelURLs = readFileLines(modelFile)
    crawlParams['model'] = modelURLs
    crawlParams['restricted'] = 0
    crawlParams['combineScore'] = 0
    crawlParams['classifierFileName'] = vsmClassifierFileName
    outputDir = inputFile.split(".")[0]
    #crawlParams['t'] = t
    if ct == 'b':
        #baseRelevantPages =baseFC(crawlParams)
        pagesDir = outputDir + "/base-webpages/"
        logDataFilename = pagesDir + "base-logData.txt"
        outputURLsFilename = pagesDir + "base-Output-URLs.txt"
        evalFilename = pagesDir + "base-evaluateData.txt"

        rp = baseFC(crawlParams)
        #rp = baseFC_OneTargetVector(crawlParams)
Code Example #9
def startCrawl(v,seedsFile,evaluator,modelFile,ct):

    #switchFC = 1
    #number of keywords to represent event/topic
    num = 15
    pagesLimit = 500
    
    pageScoreThreshold =0.7
    urlScoreThreshold = 0
    #mode = 0 # no URL scoring
    mode = 1 # URL scoring
    crawlParams = {"num_pages": pagesLimit,"pageScoreThreshold":pageScoreThreshold,"urlScoreThreshold":urlScoreThreshold ,"mode":mode}
    crawlParams['No_Keywords']=num
    seedURLs = getSeedURLs(seedsFile)
    crawlParams['seedURLs'] = seedURLs
    modelURLs = readFileLines(modelFile)
    crawlParams['model']=modelURLs
    crawlParams['restricted'] = 0
    
    
    #crawlParams['t'] = t
    if ct =='b':
        #baseRelevantPages =baseFC(crawlParams)
        
        
        rp = baseFC(crawlParams)
        
        f = open("base-webpages/"+str(v)+"/"+"base-logData.txt","w")
        furl = open("base-webpages/"+str(v)+"/"+"base-Output-URLs.txt","w")
        for p in rp:
            f.write(str(p.pageId) + "," + str(p.pageUrl[2]) + "\n")
            #furl.write(p.pageUrl[1].encode("utf-8")+","+str(p.estimatedScore)+"\n")
            furl.write(p.pageUrl[1].encode("utf-8")+"\n")
            ftext = open("base-webpages/"+str(v)+"/"+str(p.pageId) + ".txt", "w")
            ftext.write(p.text.encode("utf-8"))
            ftext.close()
        f.close()
        furl.close()
        
        res = evaluator.evaluateFC(rp)
        writeEvaluation(res,"base-webpages/"+str(v)+"/"+"base-evaluateData.txt")    
        print sum(res)
        print len(res)
    else: 
        #eventRelevantPages = eventFC(crawlParams)
        
        rp = eventFC(crawlParams)
        f = open("event-webpages/"+str(v)+"/"+"event-logData.txt","w")
        furl = open("event-webpages/"+str(v)+"/"+"event-Output-URLs.txt","w")
        for p in rp:
            f.write(str(p.pageId) + "," + str(p.pageUrl[2]) + "\n")
            #furl.write(p.pageUrl[1].encode('utf-8')+","+str(p.estimatedScore)+"\n")
            furl.write(p.pageUrl[1].encode('utf-8')+"\n")
            ftext = open("event-webpages/"+str(v)+"/"+str(p.pageId) + ".txt", "w")
            ftext.write(p.text.encode("utf-8"))
            ftext.close()
        f.close()
        furl.close()
        res = evaluator.evaluateFC(rp)
        writeEvaluation(res,"event-webpages/"+str(v)+"/"+"event-evaluateData.txt")    
        print sum(res)
        print len(res)
Code Example #10
    # show graph
    plt.show()


def readGraphFile(graphFile):
    with open(graphFile) as f:
        lines = f.readlines()
    lines = [l.strip() for l in lines]
    #graph = [(int(l.split(",")[0])+1,int(l.split(',')[1])+1) for l in lines ]
    graph = [(l.split(",")[0], l.split(',')[1]) for l in lines]
    return graph


# draw example
urlsFile = 'base-Output-URLs.txt'
urls = eu.readFileLines(urlsFile)

doms = [eu.getDomain(url) for url in urls]

uniqueDomsFreqDic = eu.getFreq(doms)
uDoms = uniqueDomsFreqDic.keys()
numDoms = len(uDoms)
uc = [random.random() for i in range(numDoms)]
uniqDomsColorsDic = dict(zip(uDoms, uc))
#c = [uniqDomsColorsDic[d] for d in doms]
#c = c[5:]

domsTuples = enumerate(doms)
domsDic = dict(domsTuples)
#domsDic = defaultdict(list)
#for i,d in domsTuples:
Code Example #11
    #evaluator.buildVSMClassifier(posFile, vsmClassifierFileName,pageTh,noK)
    classifierFileName = 'charlestonShooting_NBClassifier.p'
    evaluator.buildClassifier(posFile, negFile, classifierFileName)
    
    vsmClassifierFileName = 'classifierVSM-'+posFile.split(".")[0].split('-')[1]+".p"
    evaluator.buildVSMClassifier(posFile, vsmClassifierFileName,pageTh,noK)

    inputFile = seedsFiles[i]
    modelFile = modelFiles[i]#'modelFile.txt'#inputFile
    
    mode = 1 # URL scoring with no page scoring
    crawlParams = {"num_pages": pagesLimit,"pageScoreThreshold":pageTh,"urlScoreThreshold":urlsTh ,"mode":mode}
    crawlParams['No_Keywords']=noK
    seedURLs = getSeedURLs(inputFile)
    crawlParams['seedURLs'] = seedURLs
    modelURLs = readFileLines(modelFile)
    crawlParams['model']=modelURLs
    crawlParams['restricted'] = 0
    crawlParams['combineScore'] = 0
    outputDir = inputFile.split(".")[0]
    #crawlParams['t'] = t
    if ct =='b':
        #baseRelevantPages =baseFC(crawlParams)
        pagesDir=outputDir+"/base-webpages/"
        logDataFilename=pagesDir+"base-logData.txt"
        outputURLsFilename=pagesDir+"base-Output-URLs.txt"
        evalFilename=pagesDir+"base-evaluateData.txt"
        
        rp = baseFC(crawlParams)
        
    elif ct =='p':