Esempio n. 1
0
 def evaluate(self, ident, pct):
     for i in range(1):
         results = {}
         (train_corpus, predict_corpus) = self.corpus.load_corpus(ident, float(pct), True, True)
         savepickle(paths['svm.svm_data_tmp']+'wordids.pkl', self.corpus.wordids)
         self.generate_libsvm_input(train_corpus, 'train')
         self.generate_libsvm_input(predict_corpus, 'predict')
         print "Done generating SVM input."
         results = self.libsvm_train(True)
         print "Iteration", i, ", pct", pct
         print results
Esempio n. 2
0
def recallprecision(articles):
    """
    recall and precision for simularity method, make all pairs from urllist
    and compute the similarity.
    """
    urlids = articles.keys()
    truepos = {}
    falsepos = {}
    # cutoff range [0.01, 1.0]
    cutoff_begin = 0
    cutoff_end = 100
    for cutoff in range(cutoff_begin, cutoff_end):
        truepos[cutoff]=0
        falsepos[cutoff]=0
    
    total = 0
    pos = 0
    N = len(urlids)
    print "Number of articles: %d" % N
    progress_step = N*(N-1)/200.0
    
    simvals = {}
    for i in range(0, N-1):
        if (i % (N/10)) == 0:
            print "Progress: %.0f%%" % (float(total)/progress_step)
        for j in range(i+1, N):
            key = (urlids[i], urlids[j])
            val = corpus.cos_sim(articles[urlids[i]]['tfidf'],
                    articles[urlids[j]]['tfidf'])
            total += 1
            simvals[key] = val
            if key in checklist:
                flag = True
                pos += 1
            else:
                flag = False
                
            if articles[urlids[i]]['pubdate'] == None or \
                    articles[urlids[j]]['pubdate'] == None:
                datedelta = 0.0
            else:
                datedelta = math.fabs((articles[urlids[i]]['pubdate'] - \
                        articles[urlids[j]]['pubdate']).days)
            cutoff = cutoff_begin*0.01
            for x in range(cutoff_begin,cutoff_end):
                if datedelta < 14 and val >= cutoff:
                    if flag:
                        truepos[x] += 1
                    else:
                        #if cutoff >= 0.1 and cutoff < 0.11:
                        #    print "false pos:", key, cutoff
                        falsepos[x] += 1
                cutoff += 0.01
    print "pos:",pos
    best_cutoff = 0
    best_f1 = 0
    best_p = 0
    best_r = 0
    for cutoff in range(cutoff_begin, cutoff_end):
        if truepos[cutoff]==0 and falsepos[cutoff]==0:
            precision = 0
        else:
            precision = float(truepos[cutoff])/(truepos[cutoff]+falsepos[cutoff])
        if pos == 0:
            recall = 0
        else:
            recall = float(truepos[cutoff])/pos
        if precision == 0 and recall == 0:
            f1 = 0
        else:
            f1 = 2.0*(precision*recall)/(precision+recall)
        if f1 > best_f1:
            best_f1 = f1
            best_cutoff = cutoff
            best_p = precision
            best_r = recall
   
    print "cutoff:",best_cutoff*.01, "True Pos:", \
            truepos[best_cutoff], "False Pos:", falsepos[best_cutoff],\
            "precision:", best_p, "recall:", best_r, "f1:",best_f1
    print

    done = False
    for i in range(0, N-1):
        if done: break;
        for j in range(i+1, N):
            if done: break;
            key = (urlids[i], urlids[j])
            if key in notduplist_stored: continue
            val = corpus.cos_sim(articles[urlids[i]]['tfidf'],
                    articles[urlids[j]]['tfidf'])
            if key not in checklist and val >= (best_cutoff*0.01):
                if articles[urlids[i]]['pubdate'] == None or \
                        articles[urlids[j]]['pubdate'] == None:
                    datedelta = 0.0
                else:
                    datedelta = math.fabs((articles[urlids[i]]['pubdate'] - \
                            articles[urlids[j]]['pubdate']).days)
                if datedelta > 14: continue
                print "-- %s (%s)\n\n%s\n\n-- %s (%s)\n\n%s\n\n" % \
                        (articles[urlids[i]]['title'], str(articles[urlids[i]]['pubdate']),
                            summarizer.summarize_article(corpus, articles[urlids[i]], 4),
                            articles[urlids[j]]['title'], str(articles[urlids[j]]['pubdate']),
                            summarizer.summarize_article(corpus, articles[urlids[j]], 4))
                answer = raw_input("Duplicates? (y/n/q): ")
                if answer == "y" or answer == "Y":
                    duplist_stored.append(([key[0], key[1]], "duplist_stored"))
                if answer == "n" or answer == "N":
                    notduplist_stored.add(key)
                elif answer == "q" or answer == "Q":
                    done = True
                print "\n\n----------------\n\n"
    savepickle(paths['corpus.notduplist'], notduplist_stored)
    savepickle(paths['corpus.duplist'], duplist_stored)