def evaluate(self, ident, pct):
    """Generate libsvm input for one corpus split, train, and print the results.

    ident -- passed through unchanged to ``self.corpus.load_corpus``.
    pct   -- converted with ``float()`` before being passed to ``load_corpus``;
             echoed as given in the progress line.

    Nothing is returned; the value produced by ``self.libsvm_train(True)`` is
    only printed.  Relies on the module-level names ``paths`` and
    ``savepickle``.
    """
    # NOTE(review): the meaning of the two ``True`` flags given to load_corpus
    # and of the flag given to libsvm_train is not visible here -- confirm
    # against those methods.
    for iteration in range(1):  # single pass; the counter is only echoed below
        split = self.corpus.load_corpus(ident, float(pct), True, True)
        train_corpus, predict_corpus = split

        # Persist the word-id mapping next to the temporary SVM data.
        wordids_path = paths['svm.svm_data_tmp'] + 'wordids.pkl'
        savepickle(wordids_path, self.corpus.wordids)

        for dataset, label in ((train_corpus, 'train'),
                               (predict_corpus, 'predict')):
            self.generate_libsvm_input(dataset, label)
        print("Done generating SVM input.")

        outcome = self.libsvm_train(True)
        # A single pre-formatted argument prints the same text as the
        # comma-separated print statement would.
        print("%s %s %s %s" % ("Iteration", iteration, ", pct", pct))
        print(outcome)
# Article pairs published this many days (or more) apart are never treated
# as duplicate candidates, neither when scoring nor when reviewing.
_MAX_PUBDATE_GAP_DAYS = 14


def _pubdate_gap_days(first, second):
    """Return the absolute gap in days between two articles' 'pubdate' values.

    If either article has no pubdate (None) the gap is reported as 0, so the
    pair always passes the date filter.
    """
    if first['pubdate'] is None or second['pubdate'] is None:
        return 0
    return abs((first['pubdate'] - second['pubdate']).days)


def _precision_recall_f1(true_pos, false_pos, positives):
    """Return (precision, recall, f1) for one cutoff; undefined ratios are 0."""
    flagged = true_pos + false_pos
    precision = float(true_pos) / flagged if flagged else 0
    recall = float(true_pos) / positives if positives else 0
    if precision == 0 and recall == 0:
        return precision, recall, 0
    return precision, recall, 2.0 * (precision * recall) / (precision + recall)


def recallprecision(articles):
    """Score the similarity method over all article pairs, then review hits.

    Phase 1: every unordered pair of articles is compared with
    ``corpus.cos_sim`` on their 'tfidf' entries.  For each cutoff step
    0..99 (threshold ``step * 0.01``) a pair counts as flagged when its
    similarity reaches the threshold and its publication dates are fewer than
    14 days apart.  Flagged pairs found in ``checklist`` are true positives,
    the others false positives.

    Phase 2: the cutoff with the highest F1 is printed with its counts.

    Phase 3: every pair at or above the best cutoff that is neither in
    ``checklist`` nor in ``notduplist_stored`` is shown to the user, who
    answers y (append to ``duplist_stored``), n (add to
    ``notduplist_stored``) or q (stop reviewing).  Both collections are then
    written with ``savepickle``.

    articles -- mapping of url id to an article mapping; the keys 'tfidf',
                'pubdate' and 'title' are read here.

    Returns None.  Relies on the module-level names ``corpus``,
    ``checklist``, ``notduplist_stored``, ``duplist_stored``, ``summarizer``,
    ``paths`` and ``savepickle``.
    """
    # NOTE(review): pair keys are (urlids[i], urlids[j]) in the mapping's key
    # order; presumably checklist/notduplist_stored hold pairs in that same
    # orientation -- confirm against where they are built.
    urlids = list(articles.keys())

    # Cutoff steps 0..99 stand for similarity thresholds 0.00..0.99.
    cutoff_begin = 0
    cutoff_end = 100
    steps = range(cutoff_begin, cutoff_end)
    truepos = dict.fromkeys(steps, 0)
    falsepos = dict.fromkeys(steps, 0)

    N = len(urlids)
    print("Number of articles: %d" % N)
    # total / progress_step == percentage of the N*(N-1)/2 pairs done so far.
    progress_step = N * (N - 1) / 200.0
    # Report roughly ten times; max() keeps the modulus non-zero when N < 10
    # (N // 10 would be 0 and "i % 0" raises ZeroDivisionError).
    progress_every = max(1, N // 10)

    total = 0
    pos = 0
    simvals = {}
    for i in range(N - 1):
        if i % progress_every == 0:
            print("Progress: %.0f%%" % (float(total) / progress_step))
        first = articles[urlids[i]]
        for j in range(i + 1, N):
            second = articles[urlids[j]]
            key = (urlids[i], urlids[j])
            val = corpus.cos_sim(first['tfidf'], second['tfidf'])
            total += 1
            simvals[key] = val  # reused by the review phase below

            known_duplicate = key in checklist
            if known_duplicate:
                pos += 1

            # Pairs outside the date window are never flagged at any cutoff.
            if _pubdate_gap_days(first, second) >= _MAX_PUBDATE_GAP_DAYS:
                continue
            counts = truepos if known_duplicate else falsepos
            for step in steps:
                # step * 0.01 (not a running "+= 0.01" sum) so scoring uses
                # exactly the threshold the review phase applies later.
                if val >= step * 0.01:
                    counts[step] += 1

    print("pos: %s" % pos)

    best_cutoff = 0
    best_f1 = 0
    best_p = 0
    best_r = 0
    for step in steps:
        precision, recall, f1 = _precision_recall_f1(
            truepos[step], falsepos[step], pos)
        if f1 > best_f1:
            best_f1 = f1
            best_cutoff = step
            best_p = precision
            best_r = recall

    print("cutoff: %s True Pos: %s False Pos: %s "
          "precision: %s recall: %s f1: %s"
          % (best_cutoff * .01, truepos[best_cutoff], falsepos[best_cutoff],
             best_p, best_r, best_f1))
    print("")

    # Interactive review of the pairs the best cutoff flags but that are not
    # yet known duplicates (the false positives counted above).
    best_threshold = best_cutoff * 0.01
    done = False
    for i in range(N - 1):
        if done:
            break
        first = articles[urlids[i]]
        for j in range(i + 1, N):
            if done:
                break
            key = (urlids[i], urlids[j])
            if key in notduplist_stored:
                continue
            is_candidate = (key not in checklist
                            and simvals[key] >= best_threshold)
            if not is_candidate:
                continue
            second = articles[urlids[j]]
            # Same window as in scoring: a gap of exactly 14 days was not
            # counted there, so it is not shown here either.
            if _pubdate_gap_days(first, second) >= _MAX_PUBDATE_GAP_DAYS:
                continue

            print("-- %s (%s)\n\n%s\n\n-- %s (%s)\n\n%s\n\n" %
                  (first['title'],
                   str(first['pubdate']),
                   summarizer.summarize_article(corpus, first, 4),
                   second['title'],
                   str(second['pubdate']),
                   summarizer.summarize_article(corpus, second, 4)))
            answer = raw_input("Duplicates? (y/n/q): ")
            if answer in ("y", "Y"):
                duplist_stored.append(([key[0], key[1]], "duplist_stored"))
            elif answer in ("n", "N"):
                notduplist_stored.add(key)
            elif answer in ("q", "Q"):
                done = True
            print("\n\n----------------\n\n")

    savepickle(paths['corpus.notduplist'], notduplist_stored)
    savepickle(paths['corpus.duplist'], duplist_stored)