def search(self, keyword, num):
  """Return news results for (keyword, num), serving from memcache when possible.

  On a cache miss the query is sent to GoogleNews; a download failure is
  retried exactly once (a second failure propagates). The fresh result is
  stored in memcache with this object's TTL before being returned.
  """
  cache_key = self.create_key(keyword, num)
  cached = memcache.get(cache_key)
  if cached is not None:
    return cached
  # Cache miss: fetch from Google News, retrying once on a download error.
  try:
    cached = GoogleNews.search(keyword, num)
  except urlfetch.DownloadError:
    logging.info("retry download")
    cached = GoogleNews.search(keyword, num)
  memcache.add(cache_key, cached, self.ttl)
  return cached
# Beispiel #2
# 0
def crawl_current_day():
	gn = GoogleNews()
	r = Redis()

	gn.get_topnews()

	all_news = r.keys('page:*:title')
	p = HTMLParser()

	i = 1
	total = 0
	print F, "total pages: %d" % len(all_news)
	for key_news_title in all_news:
		newsid = key_news_title.split(':')[1] 

		# only interested in news here
		if r.get('page:%s:type' % newsid) != 'news':
			continue

		# and pages not already crawled in its first day
		if r.get('page:%s:crawled_day' % newsid) is None:			
			i += 1

			# lang for stopwords remove
			if r.get('page:%s:locale' % newsid) == 'es_cl':
				lang = 'spanish'
			else: 
				lang = 'english'

			news_title_stopwords = p.unescape(strip_accents(r.get(key_news_title).decode('utf-8', errors='ignore')))
			news_title = remove_stopwords(news_title_stopwords, lang=lang)

			print F, "searching tweets for news (w/ sw): \"%s\"" % news_title_stopwords			
			print F, "searching tweets for news (w/o sw): \"%s\"" % news_title

			# mark its news' first day as searched
			r.incr('page:%s:crawled_day' % newsid)

			# search by title in twitter
			total += search_term(news_title, newsid) 

	print F, "total news searched: %d" % i
	print F, "total tweets crawled: %d" % total
from article_manager import Article
from article_manager import ArticleManager
from keyword_manager import KeywordManager
from google_news import GoogleNews

from ironnews_utility import IronnewsUtility

print "Content-Type: text/plain"
print ""

KeywordManager.initialize()

keyword = KeywordManager.get()
print keyword.encode("utf-8")

articles = GoogleNews.search(keyword, 30)
for article in articles:
  url   = article["url"]
  title = article["title"]
  print "---"
  print url
  print title.encode("utf-8")
  if IronnewsUtility.reject(url):
    print "reject!"
    continue

  url2 = IronnewsUtility.get_canonical_url(url)
  if url2 != url:
    print "canonical! " + url2

  ArticleManager.add(url2, title, Article.CATEGORY_RAIL)