def evaluate():
    """Compare two text extractions against hand-made reference texts.

    For every ``*.true`` file in ``../../experiments/justext/`` (in sorted
    order) the reference text is compared, word by word, against

    * the ``content_all`` field of the matching corpus article (reported in
      the "justext" columns -- presumably the jusText extraction; confirm
      against AINewsCorpus), and
    * the standard output of the Goose extractor, run through Maven on the
      article's URL.

    One CSV row per article is printed to stdout: the url id, the three
    word counts, and the two word-level Levenshtein distances, each divided
    by the number of reference words. Files whose url id has no article in
    the corpus are skipped.

    Takes no arguments and returns None.
    """
    true_dir = "../../experiments/justext/"
    goose_dir = "/home/josh/aitopics/AINews/tools/goose"
    goose_log = "/home/josh/log.txt"

    def to_words(text):
        # Truncate via trunc(), blank out every character that is neither a
        # word character nor whitespace, then split on whitespace runs.
        # re.split always returns at least one element, so the word count
        # used as a divisor below is never zero.
        cleaned = re.sub(r'[^\w\s]', ' ',
                         trunc(text, max_pos=3000, ellipsis=False))
        return re.split(r'\s+', cleaned)

    corpus = AINewsCorpus()
    # Single-argument print(...) behaves identically to the print statement.
    print("urlid,length truewords,length justext,length goose,ld justtext,ld goose")
    for filename in sorted(glob.glob(true_dir + "*.true")):
        # Context manager so the handle is closed promptly instead of leaked.
        with open(filename) as truefile:
            truewords = to_words(ents.convert(truefile.read()))

        # The url id is the first four characters of the file's base name
        # (equivalent to the former hard-coded filename[26:30]).
        urlid = filename[len(true_dir):len(true_dir) + 4]
        article = corpus.get_article(urlid)
        if article is None:
            continue

        # NOTE(review): .encode('ascii') raises on non-ASCII content;
        # behaviour kept as-is.
        articlewords = to_words(article['content_all'].encode('ascii'))

        # Run Goose with an argument list rather than a shell string so that
        # quotes or shell metacharacters in the URL cannot alter the command.
        # cwd= replaces the former "cd ...;" and stderr= replaces "2>>log".
        goosecmd = [
            "/opt/maven/bin/mvn",
            "exec:java",
            "-Dexec.mainClass=com.jimplush.goose.TalkToMeGoose",
            "-Dexec.args=%s" % article['url'],
            "-q",
        ]
        with open(goose_log, 'a') as log:
            (stdout, _) = Popen(goosecmd, cwd=goose_dir,
                                stdout=PIPE, stderr=log).communicate()
        goosewords = to_words(ents.convert(stdout.encode('ascii')))

        # Word-level edit distances, normalised by the reference length.
        true_count = float(len(truewords))
        ld_1 = levenshtein_distance(truewords, articlewords) / true_count
        ld_2 = levenshtein_distance(truewords, goosewords) / true_count
        print("%s,%d,%d,%d,%.4f,%.4f" %
              (urlid, len(truewords), len(articlewords), len(goosewords),
               ld_1, ld_2))
def evaluate(self, ident, inputdir):
    """Write gold texts and system summaries for every article in a corpus.

    Loads the corpus identified by ``ident`` and, for each article, writes
    three files below ``inputdir``:

    * ``gold/<urlid>/<urlid>.fulltext`` -- the article's ``content`` field;
    * ``system/ots/<urlid>.ots.system`` -- the lines returned by
      ``self.summarize_single_ots``;
    * ``system/tfidf/<urlid>.tfidf.system`` -- the lines returned by
      ``self.summarize_article``.

    Each file ends with a trailing newline, and a "Saved <urlid>." line is
    printed after each article.

    ident    -- corpus identifier passed to ``AINewsCorpus.load_corpus``.
    inputdir -- root directory of the output tree. The ``system/ots`` and
                ``system/tfidf`` directories must already exist; only the
                per-article ``gold/<urlid>`` directory is created here.

    Raises OSError if the gold directory cannot be created, and whatever
    ``open``/``write`` raise for the output files.
    """
    corpus = AINewsCorpus()
    (articles, _) = corpus.load_corpus(ident, 1.0, True)
    for (urlid, _, _) in articles:
        article = corpus.get_article(urlid, True)

        gold_dir = "%s/gold/%s" % (inputdir, urlid)
        try:
            # makedirs also creates a missing "gold" parent directory.
            os.makedirs(gold_dir)
        except OSError:
            # An already-existing directory is expected on re-runs; any
            # other failure (permissions, bad path) must not be hidden.
            if not os.path.isdir(gold_dir):
                raise

        # "with" guarantees each file is closed even if a write fails.
        with open("%s/%s.fulltext" % (gold_dir, urlid), 'w') as f:
            f.write(article['content'])
            f.write("\n")

        with open("%s/system/ots/%s.ots.system" % (inputdir, urlid), 'w') as f:
            f.write("\n".join(self.summarize_single_ots(article)))
            f.write("\n")

        with open("%s/system/tfidf/%s.tfidf.system" % (inputdir, urlid), 'w') as f:
            f.write("\n".join(self.summarize_article(corpus, article, 4, False)))
            f.write("\n")

        # Single-argument print(...) behaves identically to the print statement.
        print("Saved %s." % urlid)