def evaluate():
    """Compare extracted article text against hand-corrected 'true' text.

    For each *.true file, compute the word-level Levenshtein distance between
    the true words and (a) the justext-extracted content stored in the corpus
    and (b) the content extracted by Goose, each normalized by the number of
    true words, and print the results as CSV.
    """
    corpus = AINewsCorpus()
    print "urlid,length truewords,length justext,length goose,ld justext,ld goose"
    for filename in sorted(glob.glob("../../experiments/justext/*.true")):
        truetext = ents.convert(file(filename).read())
        truetext = re.sub(r'[^\w\s]', ' ',
                          trunc(truetext, max_pos=3000, ellipsis=False))
        truewords = re.split(r'\s+', truetext)
        # the urlid is embedded in the filename at a fixed offset
        urlid = filename[26:30]
        article = corpus.get_article(urlid)
        if article is None:
            continue

        # words from the justext-extracted content already stored in the corpus
        articletext = re.sub(r'[^\w\s]', ' ',
                             trunc(article['content_all'].encode('ascii'),
                                   max_pos=3000, ellipsis=False))
        articlewords = re.split(r'\s+', articletext)

        # words from a fresh Goose extraction of the same URL
        goosecmd = ("cd /home/josh/aitopics/AINews/tools/goose; "
                    "/opt/maven/bin/mvn exec:java "
                    "-Dexec.mainClass=com.jimplush.goose.TalkToMeGoose "
                    "-Dexec.args='%s' -q 2>>/home/josh/log.txt") % article['url']
        (stdout, _) = Popen(goosecmd, shell=True, stdout=PIPE).communicate()
        goosetext = ents.convert(stdout.encode('ascii'))
        goosetext = re.sub(r'[^\w\s]', ' ',
                           trunc(goosetext, max_pos=3000, ellipsis=False))
        goosewords = re.split(r'\s+', goosetext)

        # normalized word-level edit distances
        ld_1 = levenshtein_distance(truewords, articlewords) / float(len(truewords))
        ld_2 = levenshtein_distance(truewords, goosewords) / float(len(truewords))
        print "%s,%d,%d,%d,%.4f,%.4f" % \
            (urlid, len(truewords), len(articlewords), len(goosewords), ld_1, ld_2)
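# levenshtein_distance() above is a project helper that is not shown in this
# excerpt. A minimal sketch of such a helper follows, assuming the standard
# dynamic-programming edit distance applied to lists of words (insertions,
# deletions, and substitutions each cost 1); the project's actual
# implementation may differ.
def levenshtein_distance_sketch(a, b):
    # previous[j] holds the distance between a[:i-1] and b[:j]
    previous = range(len(b) + 1)
    for i, word_a in enumerate(a, 1):
        current = [i]  # distance between a[:i] and the empty prefix of b
        for j, word_b in enumerate(b, 1):
            cost = 0 if word_a == word_b else 1
            current.append(min(previous[j] + 1,          # deletion
                               current[j - 1] + 1,       # insertion
                               previous[j - 1] + cost))  # substitution
        previous = current
    return previous[len(b)]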
def get_article(self, urlid, corpus=False):
    """Fetch a single article by urlid, either from the categorized corpus
    tables or from the live crawl tables."""
    row = None
    if corpus:
        table = 'cat_corpus'
        cat_table = 'cat_corpus_cats'
        row = self.db.selectone("""select u.url, u.title, u.content
            from %s as u where u.urlid = %s""" % (table, urlid))
    else:
        table = 'urllist'
        cat_table = 'categories'
        row = self.db.selectone("""select u.url, u.title, u.content, u.summary,
            u.pubdate, u.crawldate, u.processed, u.published, u.publisher
            from %s as u where u.urlid = %s""" % (table, urlid))
    if row is not None and row[2] is not None:
        wordfreq = self.txtpro.simpletextprocess(urlid, row[2])
        # the corpus tables lack the crawl metadata columns, so default them
        summary = ""
        if not corpus: summary = row[3]
        processed = False
        if not corpus and row[6] == 1: processed = True
        published = False
        if not corpus and row[7] == 1: published = True
        pubdate = ""
        if not corpus: pubdate = row[4]
        crawldate = ""
        if not corpus: crawldate = row[5]
        publisher = ""
        if not corpus: publisher = row[8]
        categories = []
        cat_rows = self.db.selectall("""select category from %s
            where urlid = %s""" % (cat_table, urlid))
        for cat_row in cat_rows:
            categories.append(cat_row[0])
        return {'urlid': urlid, 'url': row[0], 'title': row[1],
                'content': trunc(row[2], max_pos=3000),
                'content_all': row[2],
                'summary': summary,
                'pubdate': pubdate, 'crawldate': crawldate,
                'processed': processed, 'published': published,
                'publisher': publisher,
                'categories': categories,
                'duplicates': [],
                'wordfreq': wordfreq,
                'tfidf': self.get_tfidf(urlid, wordfreq)}
    else:
        return None
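# Hypothetical usage of get_article(); the urlid below is made up, and an
# AINewsCorpus instance is assumed to be configured against the project
# database. The field names match the dict returned above.
#
#   corpus = AINewsCorpus()
#   article = corpus.get_article(1234)
#   if article is not None:
#       print article['title'], article['publisher'], article['categories']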
def fetch_all_articles(self):
    """Fetch the content of every queued article in one batch via Goose."""
    try:
        os.makedirs(paths['ainews.content_tmp'])
    except OSError:
        pass

    # write the URL list that the Goose FetchMany class will read
    f = open("%surllist.txt" % paths['ainews.content_tmp'], 'w')
    for article in self.articles:
        f.write("%s\n" % article['url'])
    f.close()

    goose_cmd = ("cd %s/goose; MAVEN_OPTS=\"-Xms256m -Xmx800m\" %s exec:java "
                 "-Dexec.mainClass=com.gravity.goose.FetchMany "
                 "-Dexec.args=\"%s\" -q") % \
        (paths['libraries.tools'], paths['libraries.maven'],
         paths['ainews.content_tmp'])
    Popen(goose_cmd, shell=True).communicate()

    i = 0
    for article in self.articles:
        print "READ: Opening", ("%s%d" % (paths['ainews.content_tmp'], i))
        f = codecs.open("%s%d" % (paths['ainews.content_tmp'], i), encoding='utf-8')
        rows = f.read().split("\n")
        f.close()
        #os.remove("%s%d" % (paths['ainews.content_tmp'], i))

        # don't move this; have to ensure the increment occurs!
        i += 1

        if self.db.crawled(article['url']):
            print "READ: .. Ignoring; already in crawled database."
            continue
        if len(rows) < 3:
            print "READ: .. Ignoring; not enough lines in Goose output: URL=%s, ROWS=%s" % \
                (article['url'], rows)
            continue
        self.db.set_crawled(article['url'])

        # everything except the last two rows is the article content
        content = ' '.join(rows[:-2])
        content = convert_to_printable(cgi.escape(re.sub(r'\s+', ' ', content))).strip()
        content = re.sub("%s\\s*-?\\s*" % re.escape(article['title']), '', content)
        content = re.sub(r'\s*Share this\s*', '', content)
        content = re.sub(r'\s+,\s+', ', ', content)
        content = re.sub(r'\s+\.', '.', content)
        # shorten content to (presumably) ignore article comments
        content = trunc(content, max_pos=5000)
        article['content'] = content
        article['summary'] = self.summarizer.summarize_first_two_sentences(article['content'])
        print "SUMRY: ..", article['summary']
        article['image_url'] = convert_to_printable(rows[-2]).strip()

        if len(article['title']) < 5 or len(article['content']) < 1000:
            print "CRAWL: -- Ignoring; content or title too short. Title = {%s}; Content = {%s}" % \
                (article['title'], article['content'])
            continue

        # ignore content containing blacklisted words
        found_blacklist_word = False
        for word in blacklist_words:
            if re.search("\W%s\W" % word, article['content'], re.IGNORECASE) is not None:
                print "CRAWL: -- Ignoring; found blacklisted word \"%s\"." % word
                found_blacklist_word = True
                break
        if found_blacklist_word:
            continue

        urlid = self.put_in_db(article)
        if urlid is None:
            continue
        print "CRAWL: ++ {ID:%d/%d} %s (%s, %s)" % \
            (urlid, i, article['title'], str(article['pubdate']), article['source'])
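# trunc() is a project helper (not shown in this excerpt) used above to cap
# content length. A minimal sketch, assuming it cuts at the last whitespace
# before max_pos and optionally appends an ellipsis; the project's actual
# version may behave differently.
def trunc_sketch(text, max_pos=72, ellipsis=True):
    if len(text) <= max_pos:
        return text
    cut = text.rfind(' ', 0, max_pos)
    if cut == -1:
        cut = max_pos
    truncated = text[:cut].rstrip()
    return truncated + '...' if ellipsis else truncated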
def crawl(self, opts):
    """Crawl the news by source lists (search page or RSS)."""
    rows = self.get_newssources(opts)
    for row in rows:
        sourcepage_url = row[0]
        publisher = row[1]
        sourcetype = row[2]
        tag = row[3]
        parser = ParserFactory(publisher, sourcetype)
        if parser is None:
            continue
        if self.debug:
            print "Crawling %s (%s):" % (publisher, tag)
        try:
            parser.parse_sourcepage(sourcepage_url)
            parser.parse_storypage()
            for candidate in parser.candidates:
                if len(candidate) != 4:
                    continue
                url = candidate[0].encode('utf-8')
                print "Fetching", url
                title = convert_to_printable(
                    ents.convert(re.sub(r'\s+', ' ', candidate[1]))).strip()
                # if publisher is GoogleNews, extract the true publisher from the title
                if publisher == "GoogleNews":
                    print title
                    true_publisher = re.match(r'^.* - (.+)$', title).group(1)
                    true_publisher = "%s via Google News" % true_publisher
                elif publisher == "UserSubmitted":
                    true_publisher = re.match(
                        r'^[^\/]+:\/\/([^\/]+)(?::\d+)?\/?.*$', url).group(1)
                    true_publisher = "%s (User submitted)" % true_publisher
                else:
                    true_publisher = publisher

                # remove a trailing site name such as " - NPR" from the title
                title = re.sub(r'\s+[:-]\s+.*$', '', title)
                pubdate = candidate[2]
                content = convert_to_printable(
                    ents.convert(re.sub(r'\s+', ' ', candidate[3]))).strip()
                if isinstance(title, types.StringType):
                    title = unicode(title, errors='ignore')
                if isinstance(content, types.StringType):
                    content = unicode(content, errors='ignore')
                content = re.sub("\\s*%s\\s*" % re.escape(title), '', content)
                content = re.sub(r'\s*Share this\s*', '', content)
                content = re.sub(r'\s+,\s+', ', ', content)
                content = re.sub(r'\s+\.', '.', content)
                if len(title) < 5 or len(content) < 2000:
                    print "Content or title too short"
                    continue

                # shorten content to (presumably) ignore article comments
                content = trunc(content, max_pos=3000)

                # ignore content containing blacklisted words
                found_blacklist_word = False
                for word in blacklist_words:
                    if re.search("\W%s\W" % word, content, re.IGNORECASE) is not None:
                        print "Found blacklisted word \"%s\", ignoring article." % word
                        found_blacklist_word = True
                        break
                if found_blacklist_word:
                    continue

                urlid = self.put_in_db(url, pubdate, self.today, true_publisher,
                                       tag, title, content)
                if urlid is None:
                    continue
                try:
                    print "{ID:%d} %s (%s, %s)" % (urlid, title, str(pubdate), true_publisher)
                except:
                    pass
        except KeyboardInterrupt:
            if self.debug:
                print "Quitting early due to keyboard interrupt."
            sys.exit()
        except:
            if self.debug:
                print "Parser for %s failed." % publisher
                traceback.print_exc()
            continue