def crawl_tweets_for_event(event_id):
    """Search tweets for a festival's artists and title and print the total.

    Reads ``festival:<event_id>:title`` and ``festival:<event_id>:artists``
    from Redis, calls ``search_term`` once per artist and once for the
    normalised title, increments ``festival:<event_id>:crawled_times`` and
    prints progress lines prefixed with the module-level tag ``F``.

    Args:
        event_id: value interpolated into the ``festival:<id>:*`` Redis keys
            and passed to ``search_term`` together with the title.

    Returns:
        None. The results of ``search_term`` are summed and only printed.
    """
    # Local import so the block does not depend on the file's import header.
    import ast

    r = Redis()
    p = HTMLParser()
    total_tweets = 0

    # Normalise the title: decode, strip accents, unescape HTML entities and
    # drop stopwords (same order as before).
    event_title = r.get("festival:%s:title" % event_id).decode("utf-8", errors="ignore")
    event_title = strip_accents(event_title)
    event_title = p.unescape(event_title)
    event_title = remove_stopwords(event_title)

    # The artists value is the textual form of a dict. It used to be passed
    # to eval(), which would execute arbitrary code stored in Redis;
    # ast.literal_eval only accepts Python literals.
    # NOTE(review): assumes the stored value is a plain literal dict - confirm
    # against whatever writes this key.
    raw_artists = r.get("festival:%s:artists" % event_id)
    artists = ast.literal_eval(raw_artists) if raw_artists is not None else {}

    # NOTE: single-argument print(...) produces the same output under the
    # Python 2 print statement and the Python 3 print function.
    for k, v in artists.items():
        if isinstance(v, list):
            for artist in v:
                print("%s searching tweets for %s %s" % (F, k, artist))
                total_tweets += search_term(artist)
        elif isinstance(v, str):
            print("%s searching tweets for %s %s" % (F, k, v))
            total_tweets += search_term(v)
        # Values of any other type are skipped, as before.

    r.incr("festival:%s:crawled_times" % event_id)

    print("%s searching tweets for festival title: %s" % (F, event_title))
    # The original code annotated this second argument as "newsid".
    total_tweets += search_term(event_title, event_id)
    print("%s total tweets: %d" % (F, total_tweets))
def enrich_news(redis):
    """Collect tweets and pages for every news item found in Redis.

    For each ``news:*:id`` key the news id is taken from the second
    ``:``-separated field of the key, the language is derived from
    ``news:<id>:locale`` (``'en_us'`` -> ``'english'``, anything else ->
    ``'spanish'``) and every term from ``get_search_terms_news`` is searched
    with ``twitter.search_term``. Each returned tweet gets its ``event_id``
    attribute set to the news id.

    Args:
        redis: client object providing ``keys`` and ``get``.

    Returns:
        ``None`` when there are no ``news:*:id`` keys, otherwise a tuple
        ``(news_tweets, pages)`` accumulated over all news items and terms.
    """
    keys = redis.keys('news:*:id')
    if not keys:
        return

    news_tweets = []
    pages = []
    for key in keys:
        news_id = key.split(':')[1]
        locale = redis.get('news:%s:locale' % news_id)
        lang = 'english' if locale == 'en_us' else 'spanish'

        for term in get_search_terms_news(redis, news_id, lang):
            # Use distinct names for the per-term results: unpacking into
            # ``pages`` used to overwrite the accumulator, so the function
            # returned only the last term's pages, duplicated.
            term_tweets, term_pages = twitter.search_term(term)
            for tweet in term_tweets:
                tweet.event_id = news_id
            news_tweets.extend(term_tweets)
            pages.extend(term_pages)

    # Single-argument print(...) gives the same output under the Python 2
    # print statement and the Python 3 print function.
    print("%s got %d tweets for news" % (tag, len(news_tweets)))
    return (news_tweets, pages)
def enrich_festivals(redis):
    """Collect tweets and pages for festivals whose start date has passed.

    Every ``festival:*:startDate`` value is parsed with the format
    ``'%a, %d %b %Y %H:%M:%S'``; festivals whose start date is not later than
    the current local time are searched. For each of them every term from
    ``get_search_terms_festivals`` is passed to ``twitter.search_term`` and
    each returned tweet gets its ``event_id`` attribute set to the festival id
    (the second ``:``-separated field of the key).

    Args:
        redis: client object providing ``keys`` and ``get``.

    Returns:
        ``None`` when there are no ``festival:*:startDate`` keys, otherwise a
        tuple ``(festivals_tweets, pages)`` accumulated over all searched
        festivals (both lists are empty when no festival has started yet).
    """
    import datetime

    keys = redis.keys('festival:*:startDate')
    # Checked up front; the original tested this after the filtering loop,
    # which is a no-op for an empty key list, so the result is the same.
    if not keys:
        return

    # Only search festivals that have already started (no end date is
    # consulted here).
    now = datetime.datetime.today()
    started_ids = []
    for key in keys:
        start_date = datetime.datetime.strptime(redis.get(key), '%a, %d %b %Y %H:%M:%S')
        if now >= start_date:
            started_ids.append(key.split(':')[1])

    festivals_tweets = []
    pages = []
    for festival_id in started_ids:
        for term in get_search_terms_festivals(redis, festival_id):
            # Use distinct names for the per-term results: unpacking into
            # ``pages`` used to overwrite the accumulator, so the function
            # returned only the last term's pages, duplicated.
            term_tweets, term_pages = twitter.search_term(term)
            for tweet in term_tweets:
                tweet.event_id = festival_id
            festivals_tweets.extend(term_tweets)
            pages.extend(term_pages)

    # Single-argument print(...) gives the same output under the Python 2
    # print statement and the Python 3 print function.
    print("%s got %d tweets for festivals" % (tag, len(festivals_tweets)))
    return (festivals_tweets, pages)
def crawl_week_later():
    """Search tweets for pages already crawled on their first day.

    Iterates over every ``page:*:title`` key and, for pages that have
    ``page:<id>:crawled_day`` set but ``page:<id>:crawled_week`` unset,
    normalises the title (decode, strip accents, unescape HTML entities,
    remove stopwords for the page language) and passes it to ``search_term``
    together with the page id. Progress and totals are printed with the
    module-level tag ``F``.

    Returns:
        None. Counts are only printed.
    """
    # NOTE(review): this instance is never used below; the call is kept in
    # case the constructor has side effects - confirm and remove if not.
    gn = GoogleNews()
    r = Redis()
    # ``p`` was referenced below without being defined in this function;
    # create the parser locally, as the sibling crawlers do.
    p = HTMLParser()

    all_news = r.keys('page:*:title')
    # Starts at 0: the counter used to start at 1, so the final
    # "total news searched" line was always one too high.
    searched = 0
    total = 0

    # Single-argument print(...) gives the same output under the Python 2
    # print statement and the Python 3 print function.
    print("%s total news: %d" % (F, len(all_news)))
    for key_news_title in all_news:
        newsid = key_news_title.split(':')[1]

        # Skip pages already handled by this pass or not yet crawled on
        # their first day.
        # NOTE(review): nothing in this function sets
        # page:<id>:crawled_week, so unless search_term (or other code)
        # does, the same pages are searched again on every run - confirm.
        if r.get('page:%s:crawled_week' % newsid) is not None:
            continue
        if r.get('page:%s:crawled_day' % newsid) is None:
            continue
        searched += 1

        # Language used for stopword removal.
        if r.get('page:%s:locale' % newsid) == 'es_cl':
            lang = 'spanish'
        else:
            lang = 'english'

        raw_title = r.get(key_news_title).decode('utf-8', errors='ignore')
        news_title_stopwords = p.unescape(strip_accents(raw_title))
        news_title = remove_stopwords(news_title_stopwords, lang=lang)

        print('%s searching tweets for news (w/ sw): "%s"' % (F, news_title_stopwords))
        print('%s searching tweets for news (w/o sw): "%s"' % (F, news_title))
        print("%s searching tweets for news: %s" % (F, news_title))

        # Search by title.
        total += search_term(news_title, newsid)

    print("%s total news searched: %d" % (F, searched))
    print("%s total tweets crawled: %d" % (F, total))
def enrich_event(redis_key):
    """Search tweets for every distinct query stored in a Redis list.

    Reads the whole list at ``redis_key``, de-duplicates its entries, decodes
    each one as UTF-8 (ignoring undecodable bytes) and passes it to
    ``search_term``. The collected tweets are handed to ``save_tweets``
    together with the event id, i.e. the second ``:``-separated field of
    ``redis_key``.

    Args:
        redis_key: name of the Redis list holding the queries.
    """
    tag = '[events_enricher]'
    connection = Redis()

    collected = []
    # A set drops duplicate queries; iteration order is whatever the set yields.
    for raw_query in set(connection.lrange(redis_key, 0, -1)):
        decoded_query = raw_query.decode('utf-8', errors='ignore')
        collected.extend(search_term(decoded_query))

    # Joined into one string so the output matches the original
    # space-separated print under both Python 2 and Python 3.
    print(" ".join([tag, 'got', str(len(collected)), 'tweets']))

    event_id = redis_key.split(':')[1]
    save_tweets(collected, event_id)
def crawl_current_day():
    """Search tweets for news pages not yet crawled on their first day.

    Calls ``GoogleNews().get_topnews()``, then iterates over every
    ``page:*:title`` key. Pages whose ``page:<id>:type`` is ``'news'`` and
    whose ``page:<id>:crawled_day`` is unset have their title normalised
    (decode, strip accents, unescape HTML entities, remove stopwords for the
    page language), get ``page:<id>:crawled_day`` incremented and are passed
    to ``search_term`` together with the page id. Progress and totals are
    printed with the module-level tag ``F``.

    Returns:
        None. Counts are only printed.
    """
    gn = GoogleNews()
    r = Redis()
    # NOTE(review): presumably refreshes the stored pages before they are
    # listed below - confirm against GoogleNews.
    gn.get_topnews()

    all_news = r.keys('page:*:title')
    p = HTMLParser()
    # Starts at 0: the counter used to start at 1, so the final
    # "total news searched" line was always one too high.
    searched = 0
    total = 0

    # Single-argument print(...) gives the same output under the Python 2
    # print statement and the Python 3 print function.
    print("%s total pages: %d" % (F, len(all_news)))
    for key_news_title in all_news:
        newsid = key_news_title.split(':')[1]

        # Only interested in news here...
        if r.get('page:%s:type' % newsid) != 'news':
            continue
        # ...and only in pages not already crawled on their first day.
        if r.get('page:%s:crawled_day' % newsid) is not None:
            continue
        searched += 1

        # Language used for stopword removal.
        if r.get('page:%s:locale' % newsid) == 'es_cl':
            lang = 'spanish'
        else:
            lang = 'english'

        raw_title = r.get(key_news_title).decode('utf-8', errors='ignore')
        news_title_stopwords = p.unescape(strip_accents(raw_title))
        news_title = remove_stopwords(news_title_stopwords, lang=lang)

        print('%s searching tweets for news (w/ sw): "%s"' % (F, news_title_stopwords))
        print('%s searching tweets for news (w/o sw): "%s"' % (F, news_title))

        # Mark the page's first day as searched before querying Twitter.
        r.incr('page:%s:crawled_day' % newsid)
        # Search by title in Twitter.
        total += search_term(news_title, newsid)

    print("%s total news searched: %d" % (F, searched))
    print("%s total tweets crawled: %d" % (F, total))