def _normalized_words(text):
    """Return the word list used to compare two extractions of an article.

    The text is passed through ``trunc(..., max_pos=3000, ellipsis=False)``,
    every character that is neither a word character nor whitespace is
    replaced by a space, and the result is split on whitespace runs.

    Leading or trailing whitespace produces empty-string tokens; they are
    kept on purpose so word counts and distances are computed exactly as
    before.
    """
    truncated = trunc(text, max_pos=3000, ellipsis=False)
    return re.split(r'\s+', re.sub(r'[^\w\s]', ' ', truncated))


def evaluate():
    """Print a CSV comparing two text extractions against hand-made truth.

    For every ``*.true`` file in ``../../experiments/justext/`` the matching
    article is looked up in the corpus (files without one are skipped), the
    Goose extractor is run on the article URL through Maven, and one CSV row
    is printed with the word counts of the true text, the stored
    ``content_all`` text and the Goose output, followed by the word-level
    Levenshtein distance of each extraction to the true text, divided by the
    number of true words.

    Raises:
        OSError: if the Goose directory or the Maven binary is missing.
        UnicodeError: if the article text or the Goose output is not ASCII.
    """
    true_dir = "../../experiments/justext/"
    goose_dir = "/home/josh/aitopics/AINews/tools/goose"
    corpus = AINewsCorpus()
    print("urlid,length truewords,length justext,length goose,ld justtext,ld goose")
    for filename in sorted(glob.glob(true_dir + "*.true")):
        # Close the handle deterministically instead of relying on GC.
        with open(filename) as truefile:
            truetext = ents.convert(truefile.read())
        truewords = _normalized_words(truetext)
        # The url id is the first four characters of the file's base name.
        # NOTE(review): assumes ids are always 4 characters wide -- confirm.
        urlid = filename[len(true_dir):len(true_dir) + 4]
        article = corpus.get_article(urlid)
        if article is None:
            continue
        articlewords = _normalized_words(
            article['content_all'].encode('ascii'))
        # Pass the URL as a single argv entry rather than interpolating it
        # into a shell string: a URL containing quotes or shell
        # metacharacters can then neither break nor inject into the command.
        goosecmd = [
            "/opt/maven/bin/mvn", "exec:java",
            "-Dexec.mainClass=com.jimplush.goose.TalkToMeGoose",
            "-Dexec.args=%s" % article['url'],
            "-q",
        ]
        # Maven's stderr is appended to the log file, as "2>>" did before.
        with open("/home/josh/log.txt", "a") as errlog:
            (stdout, _) = Popen(goosecmd, cwd=goose_dir, stdout=PIPE,
                                stderr=errlog).communicate()
        goosewords = _normalized_words(ents.convert(stdout.encode('ascii')))
        # re.split always yields at least one token, so the divisor is >= 1.
        truecount = float(len(truewords))
        ld_1 = levenshtein_distance(truewords, articlewords) / truecount
        ld_2 = levenshtein_distance(truewords, goosewords) / truecount
        print("%s,%d,%d,%d,%.4f,%.4f" %
              (urlid, len(truewords), len(articlewords), len(goosewords),
               ld_1, ld_2))
Example #2
0
 def evaluate(self, ident, inputdir):
     """Write gold texts and system summaries for every article of a corpus.

     For each article returned by ``corpus.load_corpus(ident, 1.0, True)``
     three files are written below ``inputdir``:

     * ``gold/<urlid>/<urlid>.fulltext`` -- the article's ``content``
     * ``system/ots/<urlid>.ots.system`` -- lines from
       ``self.summarize_single_ots``
     * ``system/tfidf/<urlid>.tfidf.system`` -- lines from
       ``self.summarize_article``

     Only the per-article ``gold/<urlid>`` directory is created here; the
     ``gold``, ``system/ots`` and ``system/tfidf`` directories must already
     exist.

     Args:
         ident: corpus identifier handed to ``load_corpus``.
         inputdir: root directory of the evaluation tree.

     Raises:
         IOError: if one of the output files cannot be opened or written.
     """
     def save(path, text):
         # Write text plus a trailing newline; the with-block closes the
         # file even when a write fails.
         with open(path, 'w') as out:
             out.write(text)
             out.write("\n")

     corpus = AINewsCorpus()
     (articles, _) = corpus.load_corpus(ident, 1.0, True)
     for (urlid, _, _) in articles:
         article = corpus.get_article(urlid, True)
         gold_dir = "%s/gold/%s" % (inputdir, urlid)
         try:
             os.mkdir(gold_dir)
         except OSError:
             # Usually the directory is left over from an earlier run. Any
             # other failure surfaces when the file below is opened. Unlike
             # a bare except, this does not swallow KeyboardInterrupt.
             pass
         save("%s/%s.fulltext" % (gold_dir, urlid), article['content'])
         save("%s/system/ots/%s.ots.system" % (inputdir, urlid),
              "\n".join(self.summarize_single_ots(article)))
         # NOTE(review): 4 and False are passed through unchanged; 4 is
         # presumably the number of summary sentences -- confirm.
         save("%s/system/tfidf/%s.tfidf.system" % (inputdir, urlid),
              "\n".join(self.summarize_article(corpus, article, 4, False)))
         print("Saved %s." % urlid)