def startCrawl(seedsFile, evaluator, modelFile, ct, num=5, pagesLimit=100, pageScoreThreshold=0.5, urlScoreThreshold=0):
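    """Run a focused crawl and write the relevant pages to disk.

    seedsFile  -- file of seed URLs, one per line; its basename also names
                  the output directory
    evaluator  -- object exposing evaluateFC(pages) -> list of scores
    modelFile  -- file of model URLs, one per line
    ct         -- crawler type: 'b' (baseFC), 'p' (probEventFC), 'e' (eventFC)
    num        -- number of keywords representing the event/topic
    pagesLimit -- maximum number of pages to crawl
    pageScoreThreshold, urlScoreThreshold -- relevance cutoffs

    Assumes `os` is imported and the helpers getSeedURLs, readFileLines,
    baseFC, probEventFC, eventFC, and writeEvaluation are in scope.
    """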

    mode = 1 # URL scoring
    crawlParams = {"num_pages": pagesLimit,"pageScoreThreshold":pageScoreThreshold,"urlScoreThreshold":urlScoreThreshold ,"mode":mode}
    crawlParams['No_Keywords']=num
    seedURLs = getSeedURLs(seedsFile)
    crawlParams['seedURLs'] = seedURLs
    modelURLs = readFileLines(modelFile)
    crawlParams['model'] = modelURLs
    crawlParams['restricted'] = 0
    crawlParams['combineScore'] = 0
    outputDir = seedsFile.split(".")[0]
    #crawlParams['t'] = t
    if ct == 'b':
        #baseRelevantPages =baseFC(crawlParams)
        pagesDir = outputDir + "/base-webpages/"
        logDataFilename = pagesDir + "base-logData.txt"
        outputURLsFilename = pagesDir + "base-Output-URLs.txt"
        evalFilename = pagesDir + "base-evaluateData.txt"
        
        rp = baseFC(crawlParams)
        
    elif ct == 'p':
        pagesDir = outputDir + "/prob-webpages/"
        logDataFilename = pagesDir + "prob-logData.txt"
        outputURLsFilename = pagesDir + "prob-Output-URLs.txt"
        evalFilename = pagesDir + "prob-evaluateData.txt"
        rp = probEventFC(crawlParams)
        
    elif ct == 'e':
        #eventRelevantPages = eventFC(crawlParams)
        pagesDir = outputDir + "/event-webpages/"
        logDataFilename = pagesDir + "event-logData.txt"
        outputURLsFilename = pagesDir + "event-Output-URLs.txt"
        evalFilename = pagesDir + "event-evaluateData.txt"
        rp = eventFC(crawlParams)
    else:
        # guard: an unknown ct would otherwise leave pagesDir undefined below
        raise ValueError("unknown crawler type: %r" % (ct,))

    #if not os.path.exists(outputDir):
    #    os.makedirs(outputDir)
    if not os.path.exists(pagesDir):
        os.makedirs(pagesDir)
    f = open(logDataFilename,"w")
    furl = open(outputURLsFilename,"w")
    
    for p in rp:
        f.write(str(p.pageId) + "," + str(p.pageUrl[2]) + "\n")
        #furl.write(p.pageUrl[1].encode("utf-8")+","+str(p.estimatedScore)+"\n")
        furl.write(p.pageUrl[1].encode("utf-8")+"\n")
        ftext = open(pagesDir+str(p.pageId) + ".txt", "w")
        ftext.write(p.text.encode("utf-8"))
        ftext.close()
    f.close()
    furl.close()
    
    res = evaluator.evaluateFC(rp)
    writeEvaluation(res, evalFilename)
    print sum(res)  # sum of the evaluation scores
    print len(res)  # number of pages evaluated
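
# getSeedURLs, readFileLines, and writeEvaluation are defined elsewhere in
# this project. A minimal sketch of plausible implementations, assuming one
# URL per line in the seed/model files and a flat list of numeric scores
# from the evaluator (names and formats here are assumptions, not the
# project's actual code):
def getSeedURLs(seedsFile):
    # Read seed URLs, one per line, skipping blank lines.
    with open(seedsFile) as f:
        return [line.strip() for line in f if line.strip()]

def readFileLines(filename):
    # Return the non-empty lines of a file as a list of strings.
    with open(filename) as f:
        return [line.strip() for line in f if line.strip()]

def writeEvaluation(res, evalFilename):
    # Persist one evaluation score per line.
    f = open(evalFilename, "w")
    for score in res:
        f.write(str(score) + "\n")
    f.close()
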
Example 2

    #vsmClassifierFileName = 'classifierVSM-'+posFile.split(".")[0].split('-')[1]+".p"
    #evaluator.buildVSMClassifier(posFile, vsmClassifierFileName,pageTh,noK)

    inputFile = seedsFiles[i]
    modelFile = modelFiles[i]  # alternatively: 'modelFile.txt' or inputFile

    mode = 1  # URL scoring with no page scoring
    crawlParams = {
        "num_pages": pagesLimit,
        "pageScoreThreshold": pageTh,
        "urlScoreThreshold": urlsTh,
        "mode": mode
    }
    crawlParams['No_Keywords'] = noK
    seedURLs = getSeedURLs(inputFile)
    crawlParams['seedURLs'] = seedURLs
    modelURLs = readFileLines(modelFile)
    crawlParams['model'] = modelURLs
    crawlParams['restricted'] = 0
    crawlParams['combineScore'] = 0
    crawlParams['classifierFileName'] = vsmClassifierFileName
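    # crawlParams now carries the full crawler configuration: the page limit
    # and score thresholds, the scoring mode, the keyword count (No_Keywords),
    # the seed and model URL lists, the restricted and combineScore flags,
    # and the pickled VSM classifier file name.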
    outputDir = inputFile.split(".")[0]
    #crawlParams['t'] = t
    if ct == 'b':
        #baseRelevantPages =baseFC(crawlParams)
        pagesDir = outputDir + "/base-webpages/"
        logDataFilename = pagesDir + "base-logData.txt"
        outputURLsFilename = pagesDir + "base-Output-URLs.txt"
        evalFilename = pagesDir + "base-evaluateData.txt"
def startCrawl(v, seedsFile, evaluator, modelFile, ct):
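    """Variant of startCrawl keyed on a run index v.

    Writes crawl output to base-webpages/<v>/ (ct == 'b', using baseFC) or
    event-webpages/<v>/ (any other ct, using eventFC), then evaluates the
    fetched pages with evaluator.evaluateFC.
    """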

    #switchFC = 1
    # number of keywords to represent the event/topic
    num = 15
    pagesLimit = 500

    pageScoreThreshold = 0.7
    urlScoreThreshold = 0
    #mode = 0 # no URL scoring
    mode = 1 # URL scoring
    crawlParams = {"num_pages": pagesLimit,"pageScoreThreshold":pageScoreThreshold,"urlScoreThreshold":urlScoreThreshold ,"mode":mode}
    crawlParams['No_Keywords']=num
    seedURLs = getSeedURLs(seedsFile)
    crawlParams['seedURLs'] = seedURLs
    modelURLs = readFileLines(modelFile)
    crawlParams['model'] = modelURLs
    crawlParams['restricted'] = 0
    
    
    #crawlParams['t'] = t
    if ct == 'b':
        #baseRelevantPages =baseFC(crawlParams)
        
        
        rp = baseFC(crawlParams)
        
        f = open("base-webpages/"+str(v)+"/"+"base-logData.txt","w")
        furl = open("base-webpages/"+str(v)+"/"+"base-Output-URLs.txt","w")
        for p in rp:
            f.write(str(p.pageId) + "," + str(p.pageUrl[2]) + "\n")
            #furl.write(p.pageUrl[1].encode("utf-8")+","+str(p.estimatedScore)+"\n")
            furl.write(p.pageUrl[1].encode("utf-8")+"\n")
            ftext = open("base-webpages/"+str(v)+"/"+str(p.pageId) + ".txt", "w")
            ftext.write(p.text.encode("utf-8"))
            ftext.close()
        f.close()
        furl.close()
        
        res = evaluator.evaluateFC(rp)
        writeEvaluation(res,"base-webpages/"+str(v)+"/"+"base-evaluateData.txt")    
        print sum(res)
        print len(res)
    else: 
        #eventRelevantPages = eventFC(crawlParams)
        
        rp = eventFC(crawlParams)
        f = open("event-webpages/"+str(v)+"/"+"event-logData.txt","w")
        furl = open("event-webpages/"+str(v)+"/"+"event-Output-URLs.txt","w")
        for p in rp:
            f.write(str(p.pageId) + "," + str(p.pageUrl[2]) + "\n")
            #furl.write(p.pageUrl[1].encode('utf-8')+","+str(p.estimatedScore)+"\n")
            furl.write(p.pageUrl[1].encode('utf-8')+"\n")
            ftext = open("event-webpages/"+str(v)+"/"+str(p.pageId) + ".txt", "w")
            ftext.write(p.text.encode("utf-8"))
            ftext.close()
        f.close()
        furl.close()
        res = evaluator.evaluateFC(rp)
        writeEvaluation(res,"event-webpages/"+str(v)+"/"+"event-evaluateData.txt")    
        print sum(res)
        print len(res)
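
# Hedged usage sketch (assumed driver code, not from this project): this
# variant keys its output on a run index v, so a caller might loop over
# several runs; the seeds/model file names and the evaluator object below
# are placeholders for illustration.
#
#     for v in range(3):
#         startCrawl(v, "seeds.txt", evaluator, "modelFile.txt", 'e')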
    
Example 4

    #vsmClassifierFileName = 'classifierVSM-'+posFile.split(".")[0].split('-')[1]+".p"
    #evaluator.buildVSMClassifier(posFile, vsmClassifierFileName,pageTh,noK)
    classifierFileName = 'charlestonShooting_NBClassifier.p'
    evaluator.buildClassifier(posFile, negFile, classifierFileName)
    
    vsmClassifierFileName = 'classifierVSM-'+posFile.split(".")[0].split('-')[1]+".p"
    evaluator.buildVSMClassifier(posFile, vsmClassifierFileName,pageTh,noK)

    inputFile = seedsFiles[i]
    modelFile = modelFiles[i]  # alternatively: 'modelFile.txt' or inputFile
    
    mode = 1 # URL scoring with no page scoring
    crawlParams = {"num_pages": pagesLimit,"pageScoreThreshold":pageTh,"urlScoreThreshold":urlsTh ,"mode":mode}
    crawlParams['No_Keywords']=noK
    seedURLs = getSeedURLs(inputFile)
    crawlParams['seedURLs'] = seedURLs
    modelURLs = readFileLines(modelFile)
    crawlParams['model'] = modelURLs
    crawlParams['restricted'] = 0
    crawlParams['combineScore'] = 0
    outputDir = inputFile.split(".")[0]
    #crawlParams['t'] = t
    if ct == 'b':
        #baseRelevantPages =baseFC(crawlParams)
        pagesDir = outputDir + "/base-webpages/"
        logDataFilename = pagesDir + "base-logData.txt"
        outputURLsFilename = pagesDir + "base-Output-URLs.txt"
        evalFilename = pagesDir + "base-evaluateData.txt"
        
        rp = baseFC(crawlParams)