def search(self, keyword, num):
    """Return GoogleNews search results for ``keyword``, served via memcache.

    The cache key is built by ``self.create_key(keyword, num)``. On a cache
    miss the results are fetched with ``GoogleNews.search(keyword, num)``
    and stored with ``memcache.add`` using ``self.ttl`` as the expiry.

    A ``urlfetch.DownloadError`` raised by the first fetch is logged and the
    fetch is attempted exactly one more time; an error from that second
    attempt propagates to the caller.
    """
    cache_key = self.create_key(keyword, num)
    results = memcache.get(cache_key)
    if results is not None:
        # Cache hit: nothing to fetch or store.
        return results

    # NOTE: retry only once.
    try:
        results = GoogleNews.search(keyword, num)
    except urlfetch.DownloadError:
        logging.info("retry download")
        results = GoogleNews.search(keyword, num)

    memcache.add(cache_key, results, self.ttl)
    return results
def crawl_current_day(): gn = GoogleNews() r = Redis() gn.get_topnews() all_news = r.keys('page:*:title') p = HTMLParser() i = 1 total = 0 print F, "total pages: %d" % len(all_news) for key_news_title in all_news: newsid = key_news_title.split(':')[1] # only interested in news here if r.get('page:%s:type' % newsid) != 'news': continue # and pages not already crawled in its first day if r.get('page:%s:crawled_day' % newsid) is None: i += 1 # lang for stopwords remove if r.get('page:%s:locale' % newsid) == 'es_cl': lang = 'spanish' else: lang = 'english' news_title_stopwords = p.unescape(strip_accents(r.get(key_news_title).decode('utf-8', errors='ignore'))) news_title = remove_stopwords(news_title_stopwords, lang=lang) print F, "searching tweets for news (w/ sw): \"%s\"" % news_title_stopwords print F, "searching tweets for news (w/o sw): \"%s\"" % news_title # mark its news' first day as searched r.incr('page:%s:crawled_day' % newsid) # search by title in twitter total += search_term(news_title, newsid) print F, "total news searched: %d" % i print F, "total tweets crawled: %d" % total
from article_manager import Article from article_manager import ArticleManager from keyword_manager import KeywordManager from google_news import GoogleNews from ironnews_utility import IronnewsUtility print "Content-Type: text/plain" print "" KeywordManager.initialize() keyword = KeywordManager.get() print keyword.encode("utf-8") articles = GoogleNews.search(keyword, 30) for article in articles: url = article["url"] title = article["title"] print "---" print url print title.encode("utf-8") if IronnewsUtility.reject(url): print "reject!" continue url2 = IronnewsUtility.get_canonical_url(url) if url2 != url: print "canonical! " + url2 ArticleManager.add(url2, title, Article.CATEGORY_RAIL)