def evaluateVSM(targeEventFile, collFolder,k,relevTh,vsmClassifierFileName,topK):
    '''
    Evaluate the first k files of a collection with a VSM classifier.

    An Evaluate instance builds its VSM classifier via buildVSMClassifier,
    then every file named collFolder + '<j>.txt' for j in range(k) is read
    as UTF-8, wrapped in a myObj whose .text attribute holds the file
    content, and the resulting list is handed to Evaluate.evaluateFC.

    Parameters:
        targeEventFile: forwarded unchanged to buildVSMClassifier
            (presumably the file describing the target event -- confirm
            against Evaluate).
        collFolder: path prefix of the collection. It is concatenated
            directly with the file name, so it must already end with a
            path separator.
        k: number of collection files to read (0.txt .. <k-1>.txt).
        relevTh: forwarded unchanged to buildVSMClassifier.
        vsmClassifierFileName: forwarded unchanged to buildVSMClassifier.
        topK: forwarded unchanged to buildVSMClassifier.

    Returns:
        Whatever Evaluate.evaluateFC returns for the loaded documents.

    Raises:
        IOError: if one of the collection files cannot be opened.
        UnicodeDecodeError: if a file is not valid UTF-8.
    '''
    evaluator = Evaluate()
    evaluator.buildVSMClassifier(targeEventFile,vsmClassifierFileName,relevTh,topK)
    collFiles = []
    for j in range(k):
        fn = collFolder+str(j)+'.txt'
        # The with-block closes each file as soon as it has been read, so
        # a large k cannot exhaust the process's file descriptors.
        with codecs.open(fn, encoding='utf-8') as f:
            ftext = f.read()
        o = myObj()
        o.text = ftext
        collFiles.append(o)
    return evaluator.evaluateFC(collFiles)
Ejemplo n.º 2
0
def evaluateVSM(targeEventFile, collFolder,k,relevTh,vsmClassifierFileName,topK):
    '''
    Evaluate the first k files of a collection with a VSM classifier.

    An Evaluate instance builds its VSM classifier via buildVSMClassifier,
    then every file named collFolder + '<j>.txt' for j in range(k) is read
    as UTF-8, wrapped in a myObj whose .text attribute holds the file
    content, and the resulting list is handed to Evaluate.evaluateFC.

    Parameters:
        targeEventFile: forwarded unchanged to buildVSMClassifier
            (presumably the file describing the target event -- confirm
            against Evaluate).
        collFolder: path prefix of the collection. It is concatenated
            directly with the file name, so it must already end with a
            path separator.
        k: number of collection files to read (0.txt .. <k-1>.txt).
        relevTh: forwarded unchanged to buildVSMClassifier.
        vsmClassifierFileName: forwarded unchanged to buildVSMClassifier.
        topK: forwarded unchanged to buildVSMClassifier.

    Returns:
        Whatever Evaluate.evaluateFC returns for the loaded documents.

    Raises:
        IOError: if one of the collection files cannot be opened.
        UnicodeDecodeError: if a file is not valid UTF-8.
    '''
    evaluator = Evaluate()
    evaluator.buildVSMClassifier(targeEventFile,vsmClassifierFileName,relevTh,topK)
    collFiles = []
    for j in range(k):
        fn = collFolder+str(j)+'.txt'
        # The with-block closes each file as soon as it has been read, so
        # a large k cannot exhaust the process's file descriptors.
        with codecs.open(fn, encoding='utf-8') as f:
            ftext = f.read()
        o = myObj()
        o.text = ftext
        collFiles.append(o)
    return evaluator.evaluateFC(collFiles)
Ejemplo n.º 3
0
    #for i in range(3):
    pagesLimit = 1000

    noK = 10
    pageTh = 0.2
    urlsTh = 0
    i = 0
    ct = 'e'

    posFile = posFiles[i]
    negFile = negFiles[i]

    vsmClassifierFileName = 'classifierVSM-' + posFile.split(".")[0].split(
        '-')[0] + ".p"
    #evaluator.buildVSMClassifier(posFile, vsmClassifierFileName,pageTh,noK)
    evaluator.buildVSMClassifier(posFile, noK, vsmClassifierFileName)
    #classifierFileName = 'charlestonShooting_NBClassifier.p'
    #evaluator.buildClassifier(posFile, negFile, classifierFileName)

    #vsmClassifierFileName = 'classifierVSM-'+posFile.split(".")[0].split('-')[1]+".p"
    #evaluator.buildVSMClassifier(posFile, vsmClassifierFileName,pageTh,noK)

    inputFile = seedsFiles[i]
    modelFile = modelFiles[i]  #'modelFile.txt'#inputFile

    mode = 1  # URL scoring with no page scoring
    crawlParams = {
        "num_pages": pagesLimit,
        "pageScoreThreshold": pageTh,
        "urlScoreThreshold": urlsTh,
        "mode": mode
    negFiles = ['neg-Charlie.txt','neg-sydneyseige.txt']
    '''
    
    evaluator = Evaluate()
    #for i in range(3):
    noK = 10
    th = 0.75
    i=3
    posFile = posFiles[i]
    negFile = negFiles[i]
    #modelFile = modelFile +"-"+str(i)+".txt"
    #classifierFileName = 'classifier'+posFile.split(".")[0].split('-')[1]+".p"
    vsmClassifierFileName = 'classifierVSM-'+posFile.split(".")[0].split('-')[1]+".p"
    #evaluator.buildClassifier(posFile,negFolder,classifierFileName)
    #evaluator.buildClassifier(posFile,negFile,classifierFileName)
    evaluator.buildVSMClassifier(posFile, vsmClassifierFileName,th,noK)

    v = 0

    inputFile = seedsFiles[i].split('.')[0]+"_"+str(v)+".txt"
    
    '''
    event = 'Charlie'
    posFile = 'pos_'+event+'.txt'
    classifierFileName = 'classifier_'+event+'.p'
    evaluator.buildClassifier(posFile,negFolder,classifierFileName)
    inputFile = 'seedURLs_'+event+'.txt'
    modelFile = 'modelURLs_'+ event + '.txt'
    '''
    crawlType = 'e'
    modelFile = inputFile