コード例 #1
0
ファイル: news_crawler.py プロジェクト: mquezada/cc6909
def crawl_week_later():
	gn = GoogleNews()
	r = Redis()
	
	all_news = r.keys('page:*:title')

	i = 1
	total = 0
	print F, "total news: %d" % len(all_news)	
	for key_news_title in all_news:
		newsid = key_news_title.split(':')[1] 

		if r.get('page:%s:crawled_week' % newsid) is None and r.get('page:%s:crawled_day' % newsid) is not None:
			i += 1

			if r.get('page:%s:locale' % newsid) == 'es_cl':
				lang = 'spanish'
			else: 
				lang = 'english'

			news_title_stopwords = p.unescape(strip_accents(r.get(key_news_title).decode('utf-8', errors='ignore')))
			news_title = remove_stopwords(news_title_stopwords, lang=lang)
			print F, "searching tweets for news (w/ sw): \"%s\"" % news_title_stopwords			
			print F, "searching tweets for news (w/o sw): \"%s\"" % news_title

			print F, "searching tweets for news: %s" % news_title 
			total += search_term(news_title, newsid) # search by title
	print F, "total news searched: %d" % i
	print F, "total tweets crawled: %d" % total
コード例 #2
0
ファイル: festivals_crawler.py プロジェクト: mquezada/cc6909
def crawl_tweets_for_event(event_id):
    r = Redis()
    p = HTMLParser()

    total_tweets = 0

    event_title = r.get("festival:%s:title" % event_id).decode("utf-8", errors="ignore")
    event_title = strip_accents(event_title)
    event_title = p.unescape(event_title)
    event_title = remove_stopwords(event_title)

    artists = r.get("festival:%s:artists" % event_id)

    for k, v in eval(artists).items():
        if type(v) == list:
            for artist in v:
                print F, "searching tweets for %s %s" % (k, artist)
                total_tweets += search_term(artist)
        elif type(v) == str:
            print F, "searching tweets for %s %s" % (k, v)
            total_tweets += search_term(v)

    r.incr("festival:%s:crawled_times" % event_id)

    print F, "searching tweets for festival title: %s" % event_title
    total_tweets += search_term(event_title, event_id)  # newsid
    print F, "total tweets: %d" % total_tweets
コード例 #3
0
ファイル: news_crawler.py プロジェクト: mquezada/cc6909
def crawl_current_day():
	gn = GoogleNews()
	r = Redis()

	gn.get_topnews()

	all_news = r.keys('page:*:title')
	p = HTMLParser()

	i = 1
	total = 0
	print F, "total pages: %d" % len(all_news)
	for key_news_title in all_news:
		newsid = key_news_title.split(':')[1] 

		# only interested in news here
		if r.get('page:%s:type' % newsid) != 'news':
			continue

		# and pages not already crawled in its first day
		if r.get('page:%s:crawled_day' % newsid) is None:			
			i += 1

			# lang for stopwords remove
			if r.get('page:%s:locale' % newsid) == 'es_cl':
				lang = 'spanish'
			else: 
				lang = 'english'

			news_title_stopwords = p.unescape(strip_accents(r.get(key_news_title).decode('utf-8', errors='ignore')))
			news_title = remove_stopwords(news_title_stopwords, lang=lang)

			print F, "searching tweets for news (w/ sw): \"%s\"" % news_title_stopwords			
			print F, "searching tweets for news (w/o sw): \"%s\"" % news_title

			# mark its news' first day as searched
			r.incr('page:%s:crawled_day' % newsid)

			# search by title in twitter
			total += search_term(news_title, newsid) 

	print F, "total news searched: %d" % i
	print F, "total tweets crawled: %d" % total