def _require_session_owned(session_id):
    """
    Require that the session is owned by the logged-in user.
    """
    session_r = _session.find_one({'_id': bson.ObjectId(session_id)})
    if not session_r:
        raise Exception('Session not found')

    search_r = _search.find_one({'_id': bson.ObjectId(session_r['search_id'])})
    if not search_r:
        raise Exception('Search not found')

    if search_r['username'] != session.get('username', ''):
        raise Exception('You do not have permission to access this snapshot')

    return (search_r, session_r)
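
# Illustrative sketch (hypothetical, not part of the original module): how a
# read-only snapshot view might use _require_session_owned(). The function
# name and any route registration are assumptions; only _require_session_owned,
# _jsonify, and traceback come from the code in this module.
def session_info(session_id):
    """
    Return the search and session records for a snapshot owned by the
    current user (example endpoint for illustration only).
    """
    try:
        search_r, session_r = _require_session_owned(session_id)
        return _jsonify(search=search_r, session=session_r)
    except Exception, e:
        traceback.print_exc()
        return _jsonify(error=str(e))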
def filter(session_id):
    """
    Get histogram and tweets.

    @filter[]: list of elements to filter by (repeated query parameter);
        if an element starts with '#', it is a hashtag,
        if it starts with 'http', it is a URL,
        otherwise it is a stem
    """
    try:
        _require_session_access(session_id)

        session_r = _session.find_one({'_id': bson.ObjectId(session_id)})
        if not session_r:
            raise Exception('Session not found')

        search_r = _search.find_one({'_id': bson.ObjectId(session_r['search_id'])})
        if not search_r:
            raise Exception('Search not found')

        # Build the tweet query from the requested filter elements
        params = {'session_id': session_id}

        filter = request.args.getlist('filter[]')
        filter_stems = []
        filter_hashtags = []
        filter_urls = []
        for element in filter:
            if element.startswith('#'):
                filter_hashtags.append(element)
            elif element.startswith('http'):
                filter_urls.append(element)
            else:
                filter_stems.append(element)

        if filter_urls:
            params['urls'] = {'$all': filter_urls}
        if filter_stems:
            params['stems'] = {'$all': filter_stems}
        if filter_hashtags:
            params['hashtags'] = {'$all': filter_hashtags}

        cursor = _tweets.find(params, {
            'embed': 1,
            'id_str': 1,
            'created_at': 1,
            'user.name': 1,
            'user.screen_name': 1,
            'retweeted_status.id_str': 1,
            'stems': 1,
            'hashtags': 1,
            'urls': 1
        }, sort=[('dt', pymongo.DESCENDING)])

        # Process tweets: count terms, skip duplicates and already-seen retweets
        stem_counter = Counter()
        hashtag_counter = Counter()
        url_counter = Counter()
        tweets = []
        retweets = 0
        id_set = set()

        for tweet in cursor:
            stem_counter.update(tweet['stems'])
            hashtag_counter.update(tweet['hashtags'])
            url_counter.update(tweet['urls'])

            if tweet['id_str'] in id_set:
                retweets += 1
                continue
            id_set.add(tweet['id_str'])

            if 'retweeted_status' in tweet:
                retweeted_id = tweet['retweeted_status']['id_str']
                if retweeted_id in id_set:
                    retweets += 1
                    continue
                id_set.add(retweeted_id)

            tweets.append({
                'text': tweet['embed'],
                'user_name': tweet['user']['name'],
                'user_screen_name': tweet['user']['screen_name'],
                'id_str': tweet['id_str'],
                'created_at': tweet['created_at']
            })

        # Exclude elements that are already part of the active filter
        stem_counts = [x for x in stem_counter.most_common()
            if x[0] not in filter_stems]
        hashtag_counts = [x for x in hashtag_counter.most_common()
            if x[0] not in filter_hashtags]
        url_counts = [x for x in url_counter.most_common()
            if x[0] not in filter_urls]

        return _jsonify(
            search=search_r,
            session=session_r,
            stem_counts=stem_counts,
            hashtag_counts=hashtag_counts,
            url_counts=url_counts,
            tweets=tweets,
            retweets=retweets
        )
    except Exception, e:
        traceback.print_exc()
        return _jsonify(error=str(e))
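
# Standalone sketch of how the filter[] elements above map onto the MongoDB
# query: hashtags, URLs, and stems are separated, and each non-empty list
# becomes an '$all' clause, so a tweet must contain every requested element.
# _build_filter_params is a hypothetical helper and the example values are
# made up; only the query shape mirrors the code above.
def _build_filter_params(session_id, elements):
    params = {'session_id': session_id}
    stems = [e for e in elements if not e.startswith('#') and not e.startswith('http')]
    hashtags = [e for e in elements if e.startswith('#')]
    urls = [e for e in elements if e.startswith('http')]
    if stems:
        params['stems'] = {'$all': stems}
    if hashtags:
        params['hashtags'] = {'$all': hashtags}
    if urls:
        params['urls'] = {'$all': urls}
    return params

# _build_filter_params('5429abc', ['#python', 'data visual'])
# -> {'session_id': '5429abc',
#     'stems': {'$all': ['data visual']},
#     'hashtags': {'$all': ['#python']}}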
def analyze():
    """
    Get tweets from twitter and analyze them.

    @language = language code, e.g. 'en'
    @query = query string
    OR
    @list_id = list id
    """
    try:
        language = request.args.get('language') or 'en'
        query = request.args.get('query')
        list_id = request.args.get('list_id')

        if query:
            query_lower = query.lower()
        elif list_id:
            list_map = _get_list_map()
        else:
            raise Exception('No query or list specified')

        # Get api object
        api = tweepy.API(get_oauth())

        # Get/create search record
        param = {
            'username': session['username'],
            'language': language
        }
        if query:
            param['query_lower'] = query_lower
        else:
            param['list_id'] = list_id

        search_r = _search.find_one(param)
        if not search_r:
            search_r = param
            if query:
                search_r['query'] = query
            else:
                search_r['list_name'] = list_map[list_id]
            search_r['_id'] = _search.save(search_r, manipulate=True)
        search_id = str(search_r['_id'])

        # Create new search session
        session_r = {
            'search_id': search_id,
            'dt': datetime.datetime.now().isoformat(),
            'stem_counts': [],   # [[stem, post count]]
            'stem_map': {},      # {stem: [term, count]}
        }
        session_r['_id'] = _session.save(session_r, manipulate=True)
        session_id = str(session_r['_id'])

        # Process tweets
        stopwords = extract.get_stopwords(language).copy()
        stoptags = set()
        stemmer = extract.get_stemmer(language)
        stem_map = defaultdict(Counter)
        tweet_list = []

        if query:
            stoptags.update([x.lower().lstrip('#') for x in query_lower.split()])
            stopwords.update(stoptags)
            cursor = tweepy.Cursor(api.search, q=query, lang=language,
                count=100, result_type='recent', include_entities=True)
        else:
            cursor = tweepy.Cursor(api.list_timeline, list_id=list_id,
                count=100, include_entities=True)

        for tweet in cursor.items(limit=settings.TWITTER_SEARCH_LIMIT):
            tweet_dict = twutil.tweepy_model_to_dict(tweet)
            tweet_dict['session_id'] = session_id
            tweet_dict['embed'] = twutil.format_text(tweet_dict)
            tweet_dict['tokens'] = extract.tokenize(tweet_dict['text'])

            # Filter hashtags that appear in the query itself
            tweet_dict['hashtags'] = list(set([
                '#'+x['text'].lower()
                for x in tweet_dict['entities']['hashtags']
                if x['text'].lower() not in stoptags
            ]))
            tweet_dict['urls'] = list(set([
                x['expanded_url'] for x in tweet_dict['entities']['urls']
            ]))
            tweet_list.append(tweet_dict)

        # ------------------------------------------------------------
        # Process trigrams
        trigram_counter = Counter()

        for tweet in tweet_list:
            grams = []
            for tokens in tweet['tokens']:
                for g in nltk.ngrams(tokens, 3):
                    if extract.stoplist_iter(g, stopwords):
                        continue
                    if g[0].startswith('@') or g[1].startswith('@') or g[2].startswith('@'):
                        continue
                    grams.append(g)

            stems = extract.stems_from_grams(grams, stemmer)
            for s, g in zip(stems, grams):
                stem_map[s].update([g])

            tweet['stems_3'] = list(set(stems))
            trigram_counter.update(tweet['stems_3'])

        # Ignore trigrams that only appear once
        for g, n in trigram_counter.items():
            if n < 2:
                del trigram_counter[g]
                del stem_map[g]

        # ------------------------------------------------------------
        # Process bigrams
        bigram_counter = Counter()

        for tweet in tweet_list:
            grams = []
            stems = []
            for tokens in tweet['tokens']:
                # Ensure a sequence (newer NLTK versions return a generator)
                gram_list = list(nltk.ngrams(tokens, 2))
                stem_list = extract.stems_from_grams(gram_list, stemmer)
                last_i = len(gram_list) - 1

                for i, g in enumerate(gram_list):
                    if extract.stoplist_iter(g, stopwords):
                        continue
                    if g[0].startswith('@') or g[1].startswith('@'):
                        continue

                    # Skip bigrams already covered by a kept trigram
                    if i > 0 and \
                        (stem_list[i-1][0], stem_list[i][0], stem_list[i][1]) in trigram_counter:
                        continue
                    if i < last_i and \
                        (stem_list[i][0], stem_list[i][1], stem_list[i+1][1]) in trigram_counter:
                        continue

                    grams.append(g)
                    stems.append(stem_list[i])

            for s, g in zip(stems, grams):
                stem_map[s].update([g])

            tweet['stems_2'] = list(set(stems))
            bigram_counter.update(tweet['stems_2'])

        # Ignore bigrams that only appear once
        for g, n in bigram_counter.items():
            if n < 2:
                del bigram_counter[g]
                del stem_map[g]

        # ------------------------------------------------------------
        # Process unigrams
        for tweet in tweet_list:
            grams = []
            stems = []
            for tokens in tweet['tokens']:
                gram_list = list(nltk.ngrams(tokens, 1))
                stem_list = extract.stems_from_grams(gram_list, stemmer)
                last_i = len(gram_list) - 1

                for i, g in enumerate(gram_list):
                    if extract.stoplist_iter(g, stopwords):
                        continue

                    # Skip unigrams already covered by a kept bigram
                    if i > 0 and \
                        (stem_list[i-1][0], stem_list[i][0]) in bigram_counter:
                        continue
                    if i < last_i and \
                        (stem_list[i][0], stem_list[i+1][0]) in bigram_counter:
                        continue

                    # ...or by a kept trigram
                    if i > 1 and \
                        (stem_list[i-2][0], stem_list[i-1][0], stem_list[i][0]) in trigram_counter:
                        continue
                    if i > 0 and i < last_i and \
                        (stem_list[i-1][0], stem_list[i][0], stem_list[i+1][0]) in trigram_counter:
                        continue
                    if i < (last_i - 1) and \
                        (stem_list[i][0], stem_list[i+1][0], stem_list[i+2][0]) in trigram_counter:
                        continue

                    grams.append(g)
                    stems.append(stem_list[i])

            for s, g in zip(stems, grams):
                stem_map[s].update([g])

            # Combine unigram, bigram, and trigram stems for this tweet
            tweet['stems'] = [' '.join(x) for x in set(stems)]
            tweet['stems'].extend([' '.join(x) for x in tweet['stems_2'] if x in bigram_counter])
            tweet['stems'].extend([' '.join(x) for x in tweet['stems_3'] if x in trigram_counter])
            del tweet['stems_2']
            del tweet['stems_3']

        # Update session
        for stem, c in stem_map.iteritems():
            session_r['stem_map'][' '.join(stem)] = \
                [' '.join(k) for k, v in c.most_common()]

        # Save tweets
        if tweet_list:
            _tweets.insert(tweet_list)

        session_r['tweet_count'] = len(tweet_list)
        _session.save(session_r)

        return _jsonify(session=session_r)
    except tweepy.TweepError, e:
        traceback.print_exc()
        return _jsonify(error=e.message[0]['message'])
    except Exception, e:
        traceback.print_exc()
        return _jsonify(error=str(e))
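
# Minimal, self-contained sketch of the n-gram suppression idea used in
# analyze(): a bigram is dropped when it sits inside a trigram that was kept
# (appeared at least twice). This simplified version skips stemming and
# stopword handling and works on plain lowercase tokens; _suppressed_bigrams
# and the sample data are hypothetical and exist only for illustration.
# Imports are repeated so the sketch runs on its own.
from collections import Counter

import nltk


def _suppressed_bigrams(token_lists, trigram_counter):
    """Return the bigrams that are NOT covered by a kept trigram."""
    kept = []
    for tokens in token_lists:
        grams = list(nltk.ngrams(tokens, 2))
        last_i = len(grams) - 1
        for i, g in enumerate(grams):
            # A bigram at position i sits inside two candidate trigrams:
            # (tokens[i-1], tokens[i], tokens[i+1]) and (tokens[i], tokens[i+1], tokens[i+2])
            if i > 0 and (grams[i-1][0], g[0], g[1]) in trigram_counter:
                continue
            if i < last_i and (g[0], g[1], grams[i+1][1]) in trigram_counter:
                continue
            kept.append(g)
    return kept


token_lists = [['open', 'data', 'portal', 'launch'],
               ['open', 'data', 'portal', 'beta']]
trigrams = Counter(g for tokens in token_lists for g in nltk.ngrams(tokens, 3))
trigrams = Counter({g: n for g, n in trigrams.items() if n >= 2})
print(_suppressed_bigrams(token_lists, trigrams))
# ('open', 'data') and ('data', 'portal') are suppressed by the kept trigram
# ('open', 'data', 'portal'); only ('portal', 'launch') and ('portal', 'beta') remain.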